diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0ae1ba49f17c446b66e627e5e96aa2c97bb02d --- /dev/null +++ b/checkpoint-10000/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "checkpoints/vlm_dc-vae-f32c32-sana-1.1_layerwise-0_group-7/checkpoint-9250", + "ar_steps": 1, + "architectures": [ + "DiffVLMDiffusion" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "condition_layer": -1, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "image_token_id": 151655, + "img_cross_attention_dim": 2048, + "img_diffuser_depth": 6, + "img_ffn_dim_multiplier": null, + "img_hidden_size": 1536, + "img_multiple_of": 256, + "img_norm_eps": 1e-05, + "img_num_attention_heads": 12, + "img_num_kv_heads": 12, + "img_qk_norm": true, + "in_channels": 32, + "initializer_range": 0.02, + "inject_img_diffuser": false, + "input_size": 32, + "intermediate_size": 8960, + "layer_group_size": 7, + "layerwise_start_idx": 0, + "lora_alpha": 256, + "lora_bias": "none", + "lora_dropout": 0.05, + "lora_enable": false, + "lora_r": 128, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_vl", + "non_linearity": 1, + "norm_elementwise_affine": true, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "patch_size": 2, + "repa_coeff": 0.1, + "repa_layers": "2", + "repa_shared": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sample_size": 128, + "sampling_steps": 28, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.47.0", + "use_cache": true, + "use_repa": false, + "use_residual_attn": false, + "use_sliding_window": false, + "vae_path": "mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers", + "video_token_id": 151656, + "vision_config": { + "hidden_size": 1536, + "in_chans": 3, + "model_type": "qwen2_vl", + "spatial_patch_size": 14 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/checkpoint-10000/generation_config.json b/checkpoint-10000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3d465da90cb0ab59d8ba7babdefb0e88fbfa6b --- /dev/null +++ b/checkpoint-10000/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": 151645, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-10000/model-00001-of-00002.safetensors b/checkpoint-10000/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db4d55a2f491b5c89d25989f6929599a021871b5 --- /dev/null +++ b/checkpoint-10000/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f21f90bc523b73cdf3a7ba5a0d46ba708b881833462021e9016920499b54ec +size 4998598816 diff --git a/checkpoint-10000/model-00002-of-00002.safetensors b/checkpoint-10000/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20e9aa8cab4dc96bad5e8d6c358b75a126894345 --- /dev/null +++ b/checkpoint-10000/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e8afd357e3018d21f3ff9af1e218a6a120787a468edfab59d679e9d96a5c555 +size 4990560652 diff --git a/checkpoint-10000/model.safetensors.index.json b/checkpoint-10000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b85b852967adb370204fb2c3e3d18822b10ab5 --- /dev/null +++ b/checkpoint-10000/model.safetensors.index.json @@ -0,0 +1,1740 @@ +{ + "metadata": { + "total_size": 9988962252 + }, + "weight_map": { + "embed_tokens.weight": "model-00002-of-00002.safetensors", + "img2text.bias": "model-00001-of-00002.safetensors", + "img2text.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.weight": "model-00001-of-00002.safetensors", + "layers.0.gate": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.1.gate": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.10.gate": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.11.gate": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.12.gate": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.13.gate": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.14.gate": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.15.gate": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.16.gate": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.17.gate": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.17.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.17.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.18.gate": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.19.gate": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.2.gate": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.20.gate": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.21.gate": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.22.gate": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.23.gate": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.24.gate": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.25.gate": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.26.gate": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.27.gate": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.3.gate": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.4.gate": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.5.gate": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.6.gate": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.7.gate": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.8.gate": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.9.gate": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "norm.weight": "model-00002-of-00002.safetensors", + "patch_embedder.proj.bias": "model-00001-of-00002.safetensors", + "patch_embedder.proj.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.weight": "model-00001-of-00002.safetensors", + "vae.decoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-10000/optimizer.pt b/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9ce90184030e2e1784cc3973f0841cf3dd6a99c --- /dev/null +++ b/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f8e9a11b8d0f6bec1300daed7ebdb91852500bf45cc1150d035fea35a9bfc3 +size 15084326534 diff --git a/checkpoint-10000/rng_state_0.pth b/checkpoint-10000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7393cce6bbe85ca6bba774eadb160a67b331c576 --- /dev/null +++ b/checkpoint-10000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356296cf368727b0f14f082cd6d973d3746f68721e1ff72d34851f878f705455 +size 15984 diff --git a/checkpoint-10000/rng_state_1.pth b/checkpoint-10000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b97bf86501fa0389eb466468a59e92d70843b82 --- /dev/null +++ b/checkpoint-10000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6507554a7218dc304b1d5b77aa78afd492b4ce9f82845bd0c14acdc898133fbf +size 15984 diff --git a/checkpoint-10000/rng_state_2.pth b/checkpoint-10000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c44f157bd428718e373894a56a8ab8a6853c5eb --- /dev/null +++ b/checkpoint-10000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9622fc69e28136febe7fc4bda8096f9be9ef00f6fc3ed53e550c7b9bf24e75 +size 15984 diff --git a/checkpoint-10000/rng_state_3.pth b/checkpoint-10000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3eacbfa5644dd1c8926cc5a5fc23038a4873672a --- /dev/null +++ b/checkpoint-10000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e6256cbebb72b61aec386e2001c99eca50cd64e3caabed87abbaecab473bfe3 +size 15984 diff --git a/checkpoint-10000/rng_state_4.pth b/checkpoint-10000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ac0ed8f55f5eb54d210008661b4e05dbc9ff380 --- /dev/null +++ b/checkpoint-10000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b910495639901762f951a102243becc7b7c22c9ecf9ac605b3dbaad22daf19 +size 15984 diff --git a/checkpoint-10000/rng_state_5.pth b/checkpoint-10000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1346b3c0c2b96ac372e12d3723c9db15987fb554 --- /dev/null +++ b/checkpoint-10000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e54c58e07ae08f4525f416fcf26574303a9933be1edf113de699e25b9e3f03 +size 15984 diff --git a/checkpoint-10000/rng_state_6.pth b/checkpoint-10000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2ff0408bc38c10d109a7fa3fc326e85c719ce30b --- /dev/null +++ b/checkpoint-10000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc49d27e834b3f570286bef728f6f2fd05bb41e813fe9f2c8cd0474eb34e174 +size 15984 diff --git a/checkpoint-10000/rng_state_7.pth b/checkpoint-10000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3968ada279c7015e96a2c00986da43d5d62ef69 --- /dev/null +++ b/checkpoint-10000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7c069e906f94302cd351091969433b01cefe19edc1587235be42f669aa2103 +size 15984 diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2f4899af64fa43bdff908adbe884dacaf212a13 --- /dev/null +++ b/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cecba43d7459ec5ce9f816c2e7f8c9a6c0974f0e469d7bde6bc33484cab04b5 +size 1064 diff --git a/checkpoint-10000/trainer_state.json b/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..22d03df2411f6ccc085f4fc33ddc6e698f9542f0 --- /dev/null +++ b/checkpoint-10000/trainer_state.json @@ -0,0 +1,70033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.987220447284345, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007987220447284345, + "grad_norm": 0.08758673816919327, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1 + }, + { + "epoch": 0.001597444089456869, + "grad_norm": 2.9034857749938965, + "learning_rate": 0.0005, + "loss": 1.5342, + "step": 2 + }, + { + "epoch": 0.0023961661341853034, + "grad_norm": 1.260856032371521, + "learning_rate": 0.0005, + "loss": 1.3074, + "step": 3 + }, + { + "epoch": 0.003194888178913738, + "grad_norm": 2.2480077743530273, + "learning_rate": 0.0005, + "loss": 1.3434, + "step": 4 + }, + { + "epoch": 0.003993610223642172, + "grad_norm": 0.6822420358657837, + "learning_rate": 0.0005, + "loss": 1.2075, + "step": 5 + }, + { + "epoch": 0.004792332268370607, + "grad_norm": 0.7826036214828491, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 6 + }, + { + "epoch": 0.005591054313099041, + "grad_norm": 0.690284788608551, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 7 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 0.49136775732040405, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 8 + }, + { + "epoch": 0.00718849840255591, + "grad_norm": 0.3124309182167053, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 9 + }, + { + "epoch": 0.007987220447284345, + "grad_norm": 0.3409576714038849, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 10 + }, + { + "epoch": 0.00878594249201278, + "grad_norm": 0.25508174300193787, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 11 + }, + { + "epoch": 0.009584664536741214, + "grad_norm": 0.19042040407657623, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 12 + }, + { + "epoch": 0.010383386581469648, + "grad_norm": 0.2090323120355606, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 13 + }, + { + "epoch": 0.011182108626198083, + "grad_norm": 0.2102068066596985, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 14 + }, + { + "epoch": 0.011980830670926517, + "grad_norm": 0.12789177894592285, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 15 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 0.10204717516899109, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 16 + }, + { + "epoch": 0.013578274760383386, + "grad_norm": 0.174830362200737, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 17 + }, + { + "epoch": 0.01437699680511182, + "grad_norm": 0.25637468695640564, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 18 + }, + { + "epoch": 0.015175718849840255, + "grad_norm": 0.28002411127090454, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 19 + }, + { + "epoch": 0.01597444089456869, + "grad_norm": 0.23047354817390442, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 20 + }, + { + "epoch": 0.016773162939297124, + "grad_norm": 0.1548614650964737, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 21 + }, + { + "epoch": 0.01757188498402556, + "grad_norm": 0.07078541815280914, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 22 + }, + { + "epoch": 0.018370607028753993, + "grad_norm": 0.10615550726652145, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 23 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 0.10240291804075241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 24 + }, + { + "epoch": 0.019968051118210862, + "grad_norm": 0.07588993012905121, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 25 + }, + { + "epoch": 0.020766773162939296, + "grad_norm": 0.06380276381969452, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 26 + }, + { + "epoch": 0.02156549520766773, + "grad_norm": 0.06891524791717529, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 27 + }, + { + "epoch": 0.022364217252396165, + "grad_norm": 0.0625377744436264, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 28 + }, + { + "epoch": 0.0231629392971246, + "grad_norm": 0.12064792215824127, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 29 + }, + { + "epoch": 0.023961661341853034, + "grad_norm": 0.29220151901245117, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 30 + }, + { + "epoch": 0.02476038338658147, + "grad_norm": 0.7822219729423523, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 31 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 1.5172864198684692, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 32 + }, + { + "epoch": 0.026357827476038338, + "grad_norm": 0.18434809148311615, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 33 + }, + { + "epoch": 0.027156549520766772, + "grad_norm": 0.535632848739624, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 34 + }, + { + "epoch": 0.027955271565495207, + "grad_norm": 0.21549028158187866, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 35 + }, + { + "epoch": 0.02875399361022364, + "grad_norm": 0.4726889431476593, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 36 + }, + { + "epoch": 0.029552715654952075, + "grad_norm": 0.2519988417625427, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 37 + }, + { + "epoch": 0.03035143769968051, + "grad_norm": 0.2973701059818268, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 38 + }, + { + "epoch": 0.031150159744408944, + "grad_norm": 0.30153587460517883, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 39 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 0.08746712654829025, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 40 + }, + { + "epoch": 0.03274760383386582, + "grad_norm": 0.3308769762516022, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 41 + }, + { + "epoch": 0.03354632587859425, + "grad_norm": 0.10948555171489716, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 42 + }, + { + "epoch": 0.034345047923322686, + "grad_norm": 0.3044797480106354, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 43 + }, + { + "epoch": 0.03514376996805112, + "grad_norm": 0.11677752435207367, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 44 + }, + { + "epoch": 0.035942492012779555, + "grad_norm": 0.30327609181404114, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 45 + }, + { + "epoch": 0.036741214057507986, + "grad_norm": 0.10603009909391403, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 46 + }, + { + "epoch": 0.037539936102236424, + "grad_norm": 0.2693077623844147, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 47 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.11918680369853973, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 48 + }, + { + "epoch": 0.03913738019169329, + "grad_norm": 0.2965734899044037, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 49 + }, + { + "epoch": 0.039936102236421724, + "grad_norm": 0.10428953915834427, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 50 + }, + { + "epoch": 0.04073482428115016, + "grad_norm": 0.23307208716869354, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 51 + }, + { + "epoch": 0.04153354632587859, + "grad_norm": 0.07401563227176666, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 52 + }, + { + "epoch": 0.04233226837060703, + "grad_norm": 0.22344312071800232, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 53 + }, + { + "epoch": 0.04313099041533546, + "grad_norm": 0.1782081127166748, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 54 + }, + { + "epoch": 0.0439297124600639, + "grad_norm": 0.10123606026172638, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 55 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 0.2618716359138489, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 56 + }, + { + "epoch": 0.04552715654952077, + "grad_norm": 0.15046533942222595, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 57 + }, + { + "epoch": 0.0463258785942492, + "grad_norm": 0.1341097205877304, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 58 + }, + { + "epoch": 0.04712460063897764, + "grad_norm": 0.20391245186328888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 59 + }, + { + "epoch": 0.04792332268370607, + "grad_norm": 0.09610722959041595, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 60 + }, + { + "epoch": 0.048722044728434506, + "grad_norm": 0.09877557307481766, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 61 + }, + { + "epoch": 0.04952076677316294, + "grad_norm": 0.16971156001091003, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 62 + }, + { + "epoch": 0.050319488817891375, + "grad_norm": 0.1819174885749817, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 63 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.13067278265953064, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 64 + }, + { + "epoch": 0.051916932907348244, + "grad_norm": 0.10557633638381958, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 65 + }, + { + "epoch": 0.052715654952076675, + "grad_norm": 0.08713806420564651, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 66 + }, + { + "epoch": 0.05351437699680511, + "grad_norm": 0.12453104555606842, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 67 + }, + { + "epoch": 0.054313099041533544, + "grad_norm": 0.19147996604442596, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 68 + }, + { + "epoch": 0.05511182108626198, + "grad_norm": 0.21808673441410065, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 69 + }, + { + "epoch": 0.05591054313099041, + "grad_norm": 0.15922780334949493, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 70 + }, + { + "epoch": 0.05670926517571885, + "grad_norm": 0.09400095790624619, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 71 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 0.071605384349823, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 72 + }, + { + "epoch": 0.05830670926517572, + "grad_norm": 0.08754080533981323, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 73 + }, + { + "epoch": 0.05910543130990415, + "grad_norm": 0.07777409255504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 74 + }, + { + "epoch": 0.05990415335463259, + "grad_norm": 0.04577887803316116, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 75 + }, + { + "epoch": 0.06070287539936102, + "grad_norm": 0.07278449088335037, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 76 + }, + { + "epoch": 0.06150159744408946, + "grad_norm": 0.06739042699337006, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 77 + }, + { + "epoch": 0.06230031948881789, + "grad_norm": 0.06367938220500946, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 78 + }, + { + "epoch": 0.06309904153354633, + "grad_norm": 0.0551401786506176, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 79 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.04846199229359627, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 80 + }, + { + "epoch": 0.06469648562300319, + "grad_norm": 0.089615598320961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 81 + }, + { + "epoch": 0.06549520766773163, + "grad_norm": 0.19073566794395447, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 82 + }, + { + "epoch": 0.06629392971246006, + "grad_norm": 0.26971691846847534, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 83 + }, + { + "epoch": 0.0670926517571885, + "grad_norm": 0.3124604821205139, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 84 + }, + { + "epoch": 0.06789137380191693, + "grad_norm": 0.3448403775691986, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 85 + }, + { + "epoch": 0.06869009584664537, + "grad_norm": 0.2708166837692261, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 86 + }, + { + "epoch": 0.0694888178913738, + "grad_norm": 0.10507494956254959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 87 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 0.1015392392873764, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 88 + }, + { + "epoch": 0.07108626198083066, + "grad_norm": 0.34002622961997986, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 89 + }, + { + "epoch": 0.07188498402555911, + "grad_norm": 0.5238372683525085, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 90 + }, + { + "epoch": 0.07268370607028754, + "grad_norm": 0.5267866253852844, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 91 + }, + { + "epoch": 0.07348242811501597, + "grad_norm": 0.3286864757537842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 92 + }, + { + "epoch": 0.0742811501597444, + "grad_norm": 0.14270304143428802, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 93 + }, + { + "epoch": 0.07507987220447285, + "grad_norm": 0.3481365740299225, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 94 + }, + { + "epoch": 0.07587859424920128, + "grad_norm": 0.33883902430534363, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 95 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.2553725838661194, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 96 + }, + { + "epoch": 0.07747603833865814, + "grad_norm": 0.21944141387939453, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 97 + }, + { + "epoch": 0.07827476038338659, + "grad_norm": 0.18821558356285095, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 98 + }, + { + "epoch": 0.07907348242811502, + "grad_norm": 0.20073482394218445, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 99 + }, + { + "epoch": 0.07987220447284345, + "grad_norm": 0.2643139958381653, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 100 + }, + { + "epoch": 0.08067092651757188, + "grad_norm": 0.1843930184841156, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 101 + }, + { + "epoch": 0.08146964856230032, + "grad_norm": 0.12745684385299683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 102 + }, + { + "epoch": 0.08226837060702875, + "grad_norm": 0.3252592384815216, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 103 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 0.33775797486305237, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 104 + }, + { + "epoch": 0.08386581469648563, + "grad_norm": 0.24846483767032623, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 105 + }, + { + "epoch": 0.08466453674121406, + "grad_norm": 0.1598653495311737, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 106 + }, + { + "epoch": 0.08546325878594249, + "grad_norm": 0.2555698752403259, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 107 + }, + { + "epoch": 0.08626198083067092, + "grad_norm": 0.3770487308502197, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 108 + }, + { + "epoch": 0.08706070287539937, + "grad_norm": 0.3179391026496887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 109 + }, + { + "epoch": 0.0878594249201278, + "grad_norm": 0.11638858914375305, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 110 + }, + { + "epoch": 0.08865814696485623, + "grad_norm": 0.20365215837955475, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 111 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.22354111075401306, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 112 + }, + { + "epoch": 0.0902555910543131, + "grad_norm": 0.1944236010313034, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 113 + }, + { + "epoch": 0.09105431309904154, + "grad_norm": 0.16177603602409363, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 114 + }, + { + "epoch": 0.09185303514376997, + "grad_norm": 0.06650812178850174, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 115 + }, + { + "epoch": 0.0926517571884984, + "grad_norm": 0.20236945152282715, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 116 + }, + { + "epoch": 0.09345047923322684, + "grad_norm": 0.19086670875549316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 117 + }, + { + "epoch": 0.09424920127795527, + "grad_norm": 0.17380473017692566, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 118 + }, + { + "epoch": 0.0950479233226837, + "grad_norm": 0.11360115557909012, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 119 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 0.09359298646450043, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 120 + }, + { + "epoch": 0.09664536741214058, + "grad_norm": 0.15317411720752716, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 121 + }, + { + "epoch": 0.09744408945686901, + "grad_norm": 0.05564137175679207, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 122 + }, + { + "epoch": 0.09824281150159744, + "grad_norm": 0.13476046919822693, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 123 + }, + { + "epoch": 0.09904153354632587, + "grad_norm": 0.11372318118810654, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 124 + }, + { + "epoch": 0.09984025559105432, + "grad_norm": 0.11330179125070572, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 125 + }, + { + "epoch": 0.10063897763578275, + "grad_norm": 0.11304716765880585, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 126 + }, + { + "epoch": 0.10143769968051118, + "grad_norm": 0.06369871646165848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 127 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.14034464955329895, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 128 + }, + { + "epoch": 0.10303514376996806, + "grad_norm": 0.1080808937549591, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 129 + }, + { + "epoch": 0.10383386581469649, + "grad_norm": 0.09568007290363312, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 130 + }, + { + "epoch": 0.10463258785942492, + "grad_norm": 0.1359473019838333, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 131 + }, + { + "epoch": 0.10543130990415335, + "grad_norm": 0.06500346213579178, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 132 + }, + { + "epoch": 0.1062300319488818, + "grad_norm": 0.11564832180738449, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 133 + }, + { + "epoch": 0.10702875399361023, + "grad_norm": 0.2115149199962616, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 134 + }, + { + "epoch": 0.10782747603833866, + "grad_norm": 0.3098243772983551, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 135 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 0.446521133184433, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 136 + }, + { + "epoch": 0.10942492012779553, + "grad_norm": 0.5194831490516663, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 137 + }, + { + "epoch": 0.11022364217252396, + "grad_norm": 0.447731077671051, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 138 + }, + { + "epoch": 0.1110223642172524, + "grad_norm": 0.2195945680141449, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 139 + }, + { + "epoch": 0.11182108626198083, + "grad_norm": 0.1277567446231842, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 140 + }, + { + "epoch": 0.11261980830670927, + "grad_norm": 0.3284558355808258, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 141 + }, + { + "epoch": 0.1134185303514377, + "grad_norm": 0.40208715200424194, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 142 + }, + { + "epoch": 0.11421725239616613, + "grad_norm": 0.28310486674308777, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 143 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.0786294937133789, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 144 + }, + { + "epoch": 0.11581469648562301, + "grad_norm": 0.18283484876155853, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 145 + }, + { + "epoch": 0.11661341853035144, + "grad_norm": 0.20186439156532288, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 146 + }, + { + "epoch": 0.11741214057507987, + "grad_norm": 0.15860706567764282, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 147 + }, + { + "epoch": 0.1182108626198083, + "grad_norm": 0.1436982899904251, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 148 + }, + { + "epoch": 0.11900958466453675, + "grad_norm": 0.15206722915172577, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 149 + }, + { + "epoch": 0.11980830670926518, + "grad_norm": 0.252279132604599, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 150 + }, + { + "epoch": 0.12060702875399361, + "grad_norm": 0.19411228597164154, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 151 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 0.07377714663743973, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 152 + }, + { + "epoch": 0.12220447284345048, + "grad_norm": 0.15493856370449066, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 153 + }, + { + "epoch": 0.12300319488817892, + "grad_norm": 0.275601863861084, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 154 + }, + { + "epoch": 0.12380191693290735, + "grad_norm": 0.42461103200912476, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 155 + }, + { + "epoch": 0.12460063897763578, + "grad_norm": 0.41153159737586975, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 156 + }, + { + "epoch": 0.1253993610223642, + "grad_norm": 0.2487967610359192, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 157 + }, + { + "epoch": 0.12619808306709265, + "grad_norm": 0.10687623918056488, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 158 + }, + { + "epoch": 0.1269968051118211, + "grad_norm": 0.28695282340049744, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 159 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.38554099202156067, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 160 + }, + { + "epoch": 0.12859424920127796, + "grad_norm": 0.25622498989105225, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 161 + }, + { + "epoch": 0.12939297124600638, + "grad_norm": 0.10341542959213257, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 162 + }, + { + "epoch": 0.13019169329073482, + "grad_norm": 0.20450755953788757, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 163 + }, + { + "epoch": 0.13099041533546327, + "grad_norm": 0.2664271295070648, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 164 + }, + { + "epoch": 0.13178913738019168, + "grad_norm": 0.23936089873313904, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 165 + }, + { + "epoch": 0.13258785942492013, + "grad_norm": 0.0662769302725792, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 166 + }, + { + "epoch": 0.13338658146964857, + "grad_norm": 0.13597780466079712, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 167 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 0.15996500849723816, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 168 + }, + { + "epoch": 0.13498402555910544, + "grad_norm": 0.10095447301864624, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 169 + }, + { + "epoch": 0.13578274760383385, + "grad_norm": 0.09733449667692184, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 170 + }, + { + "epoch": 0.1365814696485623, + "grad_norm": 0.16480964422225952, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 171 + }, + { + "epoch": 0.13738019169329074, + "grad_norm": 0.21611596643924713, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 172 + }, + { + "epoch": 0.13817891373801916, + "grad_norm": 0.21607941389083862, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 173 + }, + { + "epoch": 0.1389776357827476, + "grad_norm": 0.2234959453344345, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 174 + }, + { + "epoch": 0.13977635782747605, + "grad_norm": 0.10778137296438217, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 175 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.1758418083190918, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 176 + }, + { + "epoch": 0.1413738019169329, + "grad_norm": 0.30717936158180237, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 177 + }, + { + "epoch": 0.14217252396166133, + "grad_norm": 0.3382156789302826, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 178 + }, + { + "epoch": 0.14297124600638977, + "grad_norm": 0.23189185559749603, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 179 + }, + { + "epoch": 0.14376996805111822, + "grad_norm": 0.04988733306527138, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 180 + }, + { + "epoch": 0.14456869009584664, + "grad_norm": 0.15606579184532166, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 181 + }, + { + "epoch": 0.14536741214057508, + "grad_norm": 0.2366417795419693, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 182 + }, + { + "epoch": 0.14616613418530353, + "grad_norm": 0.21878089010715485, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 183 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 0.09316077083349228, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 184 + }, + { + "epoch": 0.1477635782747604, + "grad_norm": 0.119263656437397, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 185 + }, + { + "epoch": 0.1485623003194888, + "grad_norm": 0.26743847131729126, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 186 + }, + { + "epoch": 0.14936102236421725, + "grad_norm": 0.34438276290893555, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 187 + }, + { + "epoch": 0.1501597444089457, + "grad_norm": 0.30809128284454346, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 188 + }, + { + "epoch": 0.1509584664536741, + "grad_norm": 0.1406010240316391, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 189 + }, + { + "epoch": 0.15175718849840256, + "grad_norm": 0.09509757161140442, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 190 + }, + { + "epoch": 0.152555910543131, + "grad_norm": 0.24529854953289032, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 191 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.2803219258785248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 192 + }, + { + "epoch": 0.15415335463258786, + "grad_norm": 0.18221652507781982, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 193 + }, + { + "epoch": 0.15495207667731628, + "grad_norm": 0.04752795770764351, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 194 + }, + { + "epoch": 0.15575079872204473, + "grad_norm": 0.14151020348072052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 195 + }, + { + "epoch": 0.15654952076677317, + "grad_norm": 0.27345412969589233, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 196 + }, + { + "epoch": 0.1573482428115016, + "grad_norm": 0.36259710788726807, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 197 + }, + { + "epoch": 0.15814696485623003, + "grad_norm": 0.30899694561958313, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 198 + }, + { + "epoch": 0.15894568690095848, + "grad_norm": 0.148394376039505, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 199 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 0.09150427579879761, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 200 + }, + { + "epoch": 0.16054313099041534, + "grad_norm": 0.2579229176044464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 201 + }, + { + "epoch": 0.16134185303514376, + "grad_norm": 0.35417553782463074, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 202 + }, + { + "epoch": 0.1621405750798722, + "grad_norm": 0.3410634994506836, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 203 + }, + { + "epoch": 0.16293929712460065, + "grad_norm": 0.20597697794437408, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 204 + }, + { + "epoch": 0.16373801916932906, + "grad_norm": 0.09722702950239182, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 205 + }, + { + "epoch": 0.1645367412140575, + "grad_norm": 0.29214075207710266, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 206 + }, + { + "epoch": 0.16533546325878595, + "grad_norm": 0.35695526003837585, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 207 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.23948919773101807, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 208 + }, + { + "epoch": 0.16693290734824281, + "grad_norm": 0.06467479467391968, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 209 + }, + { + "epoch": 0.16773162939297126, + "grad_norm": 0.2935601472854614, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 210 + }, + { + "epoch": 0.16853035143769968, + "grad_norm": 0.3354688882827759, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 211 + }, + { + "epoch": 0.16932907348242812, + "grad_norm": 0.206736221909523, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 212 + }, + { + "epoch": 0.17012779552715654, + "grad_norm": 0.04770192503929138, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 213 + }, + { + "epoch": 0.17092651757188498, + "grad_norm": 0.11713571101427078, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 214 + }, + { + "epoch": 0.17172523961661343, + "grad_norm": 0.1751943975687027, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 215 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 0.11709283292293549, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 216 + }, + { + "epoch": 0.1733226837060703, + "grad_norm": 0.08393140882253647, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 217 + }, + { + "epoch": 0.17412140575079874, + "grad_norm": 0.14036497473716736, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 218 + }, + { + "epoch": 0.17492012779552715, + "grad_norm": 0.19809649884700775, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 219 + }, + { + "epoch": 0.1757188498402556, + "grad_norm": 0.16380994021892548, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 220 + }, + { + "epoch": 0.17651757188498401, + "grad_norm": 0.03721015155315399, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 221 + }, + { + "epoch": 0.17731629392971246, + "grad_norm": 0.16769659519195557, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 222 + }, + { + "epoch": 0.1781150159744409, + "grad_norm": 0.2506882846355438, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 223 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 0.2812851667404175, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 224 + }, + { + "epoch": 0.17971246006389777, + "grad_norm": 0.2518095374107361, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 225 + }, + { + "epoch": 0.1805111821086262, + "grad_norm": 0.13027259707450867, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 226 + }, + { + "epoch": 0.18130990415335463, + "grad_norm": 0.051758985966444016, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 227 + }, + { + "epoch": 0.18210862619808307, + "grad_norm": 0.123250812292099, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 228 + }, + { + "epoch": 0.1829073482428115, + "grad_norm": 0.16475827991962433, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 229 + }, + { + "epoch": 0.18370607028753994, + "grad_norm": 0.15224772691726685, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 230 + }, + { + "epoch": 0.18450479233226838, + "grad_norm": 0.10693283379077911, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 231 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 0.059128716588020325, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 232 + }, + { + "epoch": 0.18610223642172524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 233 + }, + { + "epoch": 0.1869009584664537, + "grad_norm": 0.21447211503982544, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 234 + }, + { + "epoch": 0.1876996805111821, + "grad_norm": 0.214809849858284, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 235 + }, + { + "epoch": 0.18849840255591055, + "grad_norm": 0.16398873925209045, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 236 + }, + { + "epoch": 0.18929712460063897, + "grad_norm": 0.08273304253816605, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 237 + }, + { + "epoch": 0.1900958466453674, + "grad_norm": 0.08456159383058548, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 238 + }, + { + "epoch": 0.19089456869009586, + "grad_norm": 0.09653522819280624, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 239 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.13169406354427338, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 240 + }, + { + "epoch": 0.19249201277955272, + "grad_norm": 0.2328217476606369, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 241 + }, + { + "epoch": 0.19329073482428116, + "grad_norm": 0.2226463258266449, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 242 + }, + { + "epoch": 0.19408945686900958, + "grad_norm": 0.13330090045928955, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 243 + }, + { + "epoch": 0.19488817891373802, + "grad_norm": 0.15685412287712097, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 244 + }, + { + "epoch": 0.19568690095846644, + "grad_norm": 0.1528809666633606, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 245 + }, + { + "epoch": 0.1964856230031949, + "grad_norm": 0.2380320429801941, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 246 + }, + { + "epoch": 0.19728434504792333, + "grad_norm": 0.20447947084903717, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 247 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 0.162733793258667, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 248 + }, + { + "epoch": 0.1988817891373802, + "grad_norm": 0.10536827147006989, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 249 + }, + { + "epoch": 0.19968051118210864, + "grad_norm": 0.05464514344930649, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 250 + }, + { + "epoch": 0.20047923322683706, + "grad_norm": 0.052793700248003006, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 251 + }, + { + "epoch": 0.2012779552715655, + "grad_norm": 0.06936854124069214, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 252 + }, + { + "epoch": 0.20207667731629392, + "grad_norm": 0.17630355060100555, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 253 + }, + { + "epoch": 0.20287539936102236, + "grad_norm": 0.23443830013275146, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 254 + }, + { + "epoch": 0.2036741214057508, + "grad_norm": 0.21788854897022247, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 255 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.16827379167079926, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 256 + }, + { + "epoch": 0.20527156549520767, + "grad_norm": 0.08467451483011246, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 257 + }, + { + "epoch": 0.20607028753993611, + "grad_norm": 0.17747341096401215, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 258 + }, + { + "epoch": 0.20686900958466453, + "grad_norm": 0.20212751626968384, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 259 + }, + { + "epoch": 0.20766773162939298, + "grad_norm": 0.13319599628448486, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 260 + }, + { + "epoch": 0.2084664536741214, + "grad_norm": 0.13839752972126007, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 261 + }, + { + "epoch": 0.20926517571884984, + "grad_norm": 0.12351422011852264, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 262 + }, + { + "epoch": 0.21006389776357828, + "grad_norm": 0.1166408434510231, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 263 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 0.15500681102275848, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 264 + }, + { + "epoch": 0.21166134185303515, + "grad_norm": 0.045156076550483704, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 265 + }, + { + "epoch": 0.2124600638977636, + "grad_norm": 0.1413601189851761, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 266 + }, + { + "epoch": 0.213258785942492, + "grad_norm": 0.19309845566749573, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 267 + }, + { + "epoch": 0.21405750798722045, + "grad_norm": 0.22837650775909424, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 268 + }, + { + "epoch": 0.21485623003194887, + "grad_norm": 0.23372405767440796, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 269 + }, + { + "epoch": 0.21565495207667731, + "grad_norm": 0.2030618041753769, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 270 + }, + { + "epoch": 0.21645367412140576, + "grad_norm": 0.2092818021774292, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 271 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.18329963088035583, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 272 + }, + { + "epoch": 0.21805111821086262, + "grad_norm": 0.07353675365447998, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 273 + }, + { + "epoch": 0.21884984025559107, + "grad_norm": 0.08853492140769958, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 274 + }, + { + "epoch": 0.21964856230031948, + "grad_norm": 0.14666804671287537, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 275 + }, + { + "epoch": 0.22044728434504793, + "grad_norm": 0.12529602646827698, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 276 + }, + { + "epoch": 0.22124600638977635, + "grad_norm": 0.1571074277162552, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 277 + }, + { + "epoch": 0.2220447284345048, + "grad_norm": 0.09636949002742767, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 278 + }, + { + "epoch": 0.22284345047923323, + "grad_norm": 0.16803453862667084, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 279 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 0.258849561214447, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 280 + }, + { + "epoch": 0.2244408945686901, + "grad_norm": 0.29162102937698364, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 281 + }, + { + "epoch": 0.22523961661341854, + "grad_norm": 0.32085782289505005, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 282 + }, + { + "epoch": 0.22603833865814696, + "grad_norm": 0.24114084243774414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 283 + }, + { + "epoch": 0.2268370607028754, + "grad_norm": 0.11804991215467453, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 284 + }, + { + "epoch": 0.22763578274760382, + "grad_norm": 0.16640789806842804, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 285 + }, + { + "epoch": 0.22843450479233227, + "grad_norm": 0.33951282501220703, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 286 + }, + { + "epoch": 0.2292332268370607, + "grad_norm": 0.3939269483089447, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 287 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.2742229402065277, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 288 + }, + { + "epoch": 0.23083067092651757, + "grad_norm": 0.1000385507941246, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 289 + }, + { + "epoch": 0.23162939297124602, + "grad_norm": 0.15618765354156494, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 290 + }, + { + "epoch": 0.23242811501597443, + "grad_norm": 0.3464474081993103, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 291 + }, + { + "epoch": 0.23322683706070288, + "grad_norm": 0.4524421989917755, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 292 + }, + { + "epoch": 0.2340255591054313, + "grad_norm": 0.38890203833580017, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 293 + }, + { + "epoch": 0.23482428115015974, + "grad_norm": 0.15225796401500702, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 294 + }, + { + "epoch": 0.2356230031948882, + "grad_norm": 0.18742015957832336, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 295 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 0.454607754945755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 296 + }, + { + "epoch": 0.23722044728434505, + "grad_norm": 0.4426102638244629, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 297 + }, + { + "epoch": 0.2380191693290735, + "grad_norm": 0.1442587673664093, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 298 + }, + { + "epoch": 0.2388178913738019, + "grad_norm": 0.2338172197341919, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 299 + }, + { + "epoch": 0.23961661341853036, + "grad_norm": 0.4115936756134033, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 300 + }, + { + "epoch": 0.24041533546325877, + "grad_norm": 0.38746342062950134, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 301 + }, + { + "epoch": 0.24121405750798722, + "grad_norm": 0.11506912112236023, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 302 + }, + { + "epoch": 0.24201277955271566, + "grad_norm": 0.20454810559749603, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 303 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.34620603919029236, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 304 + }, + { + "epoch": 0.24361022364217252, + "grad_norm": 0.27727624773979187, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 305 + }, + { + "epoch": 0.24440894568690097, + "grad_norm": 0.062395140528678894, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 306 + }, + { + "epoch": 0.2452076677316294, + "grad_norm": 0.25391891598701477, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 307 + }, + { + "epoch": 0.24600638977635783, + "grad_norm": 0.3807840049266815, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 308 + }, + { + "epoch": 0.24680511182108625, + "grad_norm": 0.31564414501190186, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 309 + }, + { + "epoch": 0.2476038338658147, + "grad_norm": 0.044667672365903854, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 310 + }, + { + "epoch": 0.24840255591054314, + "grad_norm": 0.2656041979789734, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 311 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 0.2954655587673187, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 312 + }, + { + "epoch": 0.25, + "grad_norm": 0.14636820554733276, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 313 + }, + { + "epoch": 0.2507987220447284, + "grad_norm": 0.16759099066257477, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 314 + }, + { + "epoch": 0.2515974440894569, + "grad_norm": 0.28777605295181274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 315 + }, + { + "epoch": 0.2523961661341853, + "grad_norm": 0.2817089855670929, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 316 + }, + { + "epoch": 0.2531948881789137, + "grad_norm": 0.09457004815340042, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 317 + }, + { + "epoch": 0.2539936102236422, + "grad_norm": 0.15224558115005493, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 318 + }, + { + "epoch": 0.2547923322683706, + "grad_norm": 0.17883236706256866, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 319 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.08269336074590683, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 320 + }, + { + "epoch": 0.2563897763578275, + "grad_norm": 0.10430650413036346, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 321 + }, + { + "epoch": 0.2571884984025559, + "grad_norm": 0.06464210897684097, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 322 + }, + { + "epoch": 0.25798722044728434, + "grad_norm": 0.08100844919681549, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 323 + }, + { + "epoch": 0.25878594249201275, + "grad_norm": 0.10375291109085083, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 324 + }, + { + "epoch": 0.2595846645367412, + "grad_norm": 0.14621509611606598, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 325 + }, + { + "epoch": 0.26038338658146964, + "grad_norm": 0.12707975506782532, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 326 + }, + { + "epoch": 0.26118210862619806, + "grad_norm": 0.04542430862784386, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 327 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 0.13504259288311005, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 328 + }, + { + "epoch": 0.26277955271565495, + "grad_norm": 0.20337320864200592, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 329 + }, + { + "epoch": 0.26357827476038337, + "grad_norm": 0.23682020604610443, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 330 + }, + { + "epoch": 0.26437699680511184, + "grad_norm": 0.15198387205600739, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 331 + }, + { + "epoch": 0.26517571884984026, + "grad_norm": 0.04014969989657402, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 332 + }, + { + "epoch": 0.2659744408945687, + "grad_norm": 0.10505357384681702, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 333 + }, + { + "epoch": 0.26677316293929715, + "grad_norm": 0.08121145516633987, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 334 + }, + { + "epoch": 0.26757188498402557, + "grad_norm": 0.062118109315633774, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 335 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.13389311730861664, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 336 + }, + { + "epoch": 0.26916932907348246, + "grad_norm": 0.24840199947357178, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 337 + }, + { + "epoch": 0.26996805111821087, + "grad_norm": 0.33511659502983093, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 338 + }, + { + "epoch": 0.2707667731629393, + "grad_norm": 0.2905866801738739, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 339 + }, + { + "epoch": 0.2715654952076677, + "grad_norm": 0.15471668541431427, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 340 + }, + { + "epoch": 0.2723642172523962, + "grad_norm": 0.09973842650651932, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 341 + }, + { + "epoch": 0.2731629392971246, + "grad_norm": 0.19315758347511292, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 342 + }, + { + "epoch": 0.273961661341853, + "grad_norm": 0.2122231423854828, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 343 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 0.11207931488752365, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 344 + }, + { + "epoch": 0.2755591054313099, + "grad_norm": 0.11863203346729279, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 345 + }, + { + "epoch": 0.2763578274760383, + "grad_norm": 0.22022183239459991, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 346 + }, + { + "epoch": 0.2771565495207668, + "grad_norm": 0.225724458694458, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 347 + }, + { + "epoch": 0.2779552715654952, + "grad_norm": 0.1622191071510315, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 348 + }, + { + "epoch": 0.2787539936102236, + "grad_norm": 0.05987359210848808, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 349 + }, + { + "epoch": 0.2795527156549521, + "grad_norm": 0.08514829725027084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 350 + }, + { + "epoch": 0.2803514376996805, + "grad_norm": 0.10734611004590988, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 351 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.12458663433790207, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 352 + }, + { + "epoch": 0.2819488817891374, + "grad_norm": 0.12223048508167267, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 353 + }, + { + "epoch": 0.2827476038338658, + "grad_norm": 0.0663333311676979, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 354 + }, + { + "epoch": 0.28354632587859424, + "grad_norm": 0.0628359317779541, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 355 + }, + { + "epoch": 0.28434504792332266, + "grad_norm": 0.1566074788570404, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 356 + }, + { + "epoch": 0.28514376996805113, + "grad_norm": 0.23291122913360596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 357 + }, + { + "epoch": 0.28594249201277955, + "grad_norm": 0.21403467655181885, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 358 + }, + { + "epoch": 0.28674121405750796, + "grad_norm": 0.08412498980760574, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 359 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 0.1415901631116867, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 360 + }, + { + "epoch": 0.28833865814696485, + "grad_norm": 0.29960349202156067, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 361 + }, + { + "epoch": 0.28913738019169327, + "grad_norm": 0.33849450945854187, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 362 + }, + { + "epoch": 0.28993610223642174, + "grad_norm": 0.24428068101406097, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 363 + }, + { + "epoch": 0.29073482428115016, + "grad_norm": 0.07897785305976868, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 364 + }, + { + "epoch": 0.2915335463258786, + "grad_norm": 0.1347426027059555, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 365 + }, + { + "epoch": 0.29233226837060705, + "grad_norm": 0.21387724578380585, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 366 + }, + { + "epoch": 0.29313099041533547, + "grad_norm": 0.13869348168373108, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 367 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.062060993164777756, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 368 + }, + { + "epoch": 0.29472843450479236, + "grad_norm": 0.13848915696144104, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 369 + }, + { + "epoch": 0.2955271565495208, + "grad_norm": 0.12179117649793625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 370 + }, + { + "epoch": 0.2963258785942492, + "grad_norm": 0.13039280474185944, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 371 + }, + { + "epoch": 0.2971246006389776, + "grad_norm": 0.09119348227977753, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 372 + }, + { + "epoch": 0.2979233226837061, + "grad_norm": 0.06374438107013702, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 373 + }, + { + "epoch": 0.2987220447284345, + "grad_norm": 0.1524113267660141, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 374 + }, + { + "epoch": 0.2995207667731629, + "grad_norm": 0.18103912472724915, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 375 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 0.1439986377954483, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 376 + }, + { + "epoch": 0.3011182108626198, + "grad_norm": 0.1268371045589447, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 377 + }, + { + "epoch": 0.3019169329073482, + "grad_norm": 0.07370569556951523, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 378 + }, + { + "epoch": 0.3027156549520767, + "grad_norm": 0.0718536451458931, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 379 + }, + { + "epoch": 0.3035143769968051, + "grad_norm": 0.10444384068250656, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 380 + }, + { + "epoch": 0.30431309904153353, + "grad_norm": 0.10085552930831909, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 381 + }, + { + "epoch": 0.305111821086262, + "grad_norm": 0.08599484711885452, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 382 + }, + { + "epoch": 0.3059105431309904, + "grad_norm": 0.08912923187017441, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 383 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.17919759452342987, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 384 + }, + { + "epoch": 0.3075079872204473, + "grad_norm": 0.23954501748085022, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 385 + }, + { + "epoch": 0.3083067092651757, + "grad_norm": 0.2940942645072937, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 386 + }, + { + "epoch": 0.30910543130990414, + "grad_norm": 0.2905970513820648, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 387 + }, + { + "epoch": 0.30990415335463256, + "grad_norm": 0.2555491626262665, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 388 + }, + { + "epoch": 0.31070287539936103, + "grad_norm": 0.15303272008895874, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 389 + }, + { + "epoch": 0.31150159744408945, + "grad_norm": 0.10148895531892776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 390 + }, + { + "epoch": 0.31230031948881787, + "grad_norm": 0.21828792989253998, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 391 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 0.27219685912132263, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 392 + }, + { + "epoch": 0.31389776357827476, + "grad_norm": 0.3431699872016907, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 393 + }, + { + "epoch": 0.3146964856230032, + "grad_norm": 0.32346805930137634, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 394 + }, + { + "epoch": 0.31549520766773165, + "grad_norm": 0.17791730165481567, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 395 + }, + { + "epoch": 0.31629392971246006, + "grad_norm": 0.09576063603162766, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 396 + }, + { + "epoch": 0.3170926517571885, + "grad_norm": 0.050598498433828354, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 397 + }, + { + "epoch": 0.31789137380191695, + "grad_norm": 0.07385009527206421, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 398 + }, + { + "epoch": 0.31869009584664537, + "grad_norm": 0.08680527657270432, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 399 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.06436332315206528, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 400 + }, + { + "epoch": 0.32028753993610226, + "grad_norm": 0.05943639203906059, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 401 + }, + { + "epoch": 0.3210862619808307, + "grad_norm": 0.10015929490327835, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 402 + }, + { + "epoch": 0.3218849840255591, + "grad_norm": 0.07852698862552643, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 403 + }, + { + "epoch": 0.3226837060702875, + "grad_norm": 0.06103534996509552, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 404 + }, + { + "epoch": 0.323482428115016, + "grad_norm": 0.04573113098740578, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 405 + }, + { + "epoch": 0.3242811501597444, + "grad_norm": 0.06108849495649338, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 406 + }, + { + "epoch": 0.3250798722044728, + "grad_norm": 0.10209841281175613, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 407 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 0.0956021398305893, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 408 + }, + { + "epoch": 0.3266773162939297, + "grad_norm": 0.12572422623634338, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 409 + }, + { + "epoch": 0.3274760383386581, + "grad_norm": 0.1532585173845291, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 410 + }, + { + "epoch": 0.3282747603833866, + "grad_norm": 0.10664337128400803, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 411 + }, + { + "epoch": 0.329073482428115, + "grad_norm": 0.07705336064100266, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 412 + }, + { + "epoch": 0.32987220447284343, + "grad_norm": 0.08611477166414261, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 413 + }, + { + "epoch": 0.3306709265175719, + "grad_norm": 0.11460789293050766, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 414 + }, + { + "epoch": 0.3314696485623003, + "grad_norm": 0.1214505136013031, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 415 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.07482243329286575, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 416 + }, + { + "epoch": 0.3330670926517572, + "grad_norm": 0.05022026225924492, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 417 + }, + { + "epoch": 0.33386581469648563, + "grad_norm": 0.086161769926548, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 418 + }, + { + "epoch": 0.33466453674121405, + "grad_norm": 0.05073339864611626, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 419 + }, + { + "epoch": 0.3354632587859425, + "grad_norm": 0.0925290584564209, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 420 + }, + { + "epoch": 0.33626198083067094, + "grad_norm": 0.08073565363883972, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 421 + }, + { + "epoch": 0.33706070287539935, + "grad_norm": 0.06067343428730965, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 422 + }, + { + "epoch": 0.33785942492012777, + "grad_norm": 0.16081079840660095, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 423 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 0.3043743371963501, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 424 + }, + { + "epoch": 0.33945686900958466, + "grad_norm": 0.32498979568481445, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 425 + }, + { + "epoch": 0.3402555910543131, + "grad_norm": 0.206096351146698, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 426 + }, + { + "epoch": 0.34105431309904155, + "grad_norm": 0.11892937123775482, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 427 + }, + { + "epoch": 0.34185303514376997, + "grad_norm": 0.19896888732910156, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 428 + }, + { + "epoch": 0.3426517571884984, + "grad_norm": 0.3295411169528961, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 429 + }, + { + "epoch": 0.34345047923322686, + "grad_norm": 0.3841599225997925, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 430 + }, + { + "epoch": 0.3442492012779553, + "grad_norm": 0.36113840341567993, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 431 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.25694623589515686, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 432 + }, + { + "epoch": 0.34584664536741216, + "grad_norm": 0.07741750776767731, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 433 + }, + { + "epoch": 0.3466453674121406, + "grad_norm": 0.1385476440191269, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 434 + }, + { + "epoch": 0.347444089456869, + "grad_norm": 0.22972947359085083, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 435 + }, + { + "epoch": 0.34824281150159747, + "grad_norm": 0.15720337629318237, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 436 + }, + { + "epoch": 0.3490415335463259, + "grad_norm": 0.04451138526201248, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 437 + }, + { + "epoch": 0.3498402555910543, + "grad_norm": 0.15054486691951752, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 438 + }, + { + "epoch": 0.3506389776357827, + "grad_norm": 0.16740895807743073, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 439 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 0.1388419270515442, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 440 + }, + { + "epoch": 0.3522364217252396, + "grad_norm": 0.06480700522661209, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 441 + }, + { + "epoch": 0.35303514376996803, + "grad_norm": 0.09604794532060623, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 442 + }, + { + "epoch": 0.3538338658146965, + "grad_norm": 0.174916610121727, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.3546325878594249, + "grad_norm": 0.2228047251701355, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 444 + }, + { + "epoch": 0.35543130990415334, + "grad_norm": 0.24461773037910461, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 445 + }, + { + "epoch": 0.3562300319488818, + "grad_norm": 0.2201017141342163, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 446 + }, + { + "epoch": 0.3570287539936102, + "grad_norm": 0.11596337705850601, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 447 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.1682164967060089, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 448 + }, + { + "epoch": 0.3586261980830671, + "grad_norm": 0.4297041594982147, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 449 + }, + { + "epoch": 0.35942492012779553, + "grad_norm": 0.5659548044204712, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 450 + }, + { + "epoch": 0.36022364217252395, + "grad_norm": 0.5303114652633667, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 451 + }, + { + "epoch": 0.3610223642172524, + "grad_norm": 0.23788955807685852, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 452 + }, + { + "epoch": 0.36182108626198084, + "grad_norm": 0.15622566640377045, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 453 + }, + { + "epoch": 0.36261980830670926, + "grad_norm": 0.327275812625885, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 454 + }, + { + "epoch": 0.3634185303514377, + "grad_norm": 0.23511037230491638, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 455 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 0.11690831184387207, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 456 + }, + { + "epoch": 0.36501597444089456, + "grad_norm": 0.17950886487960815, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 457 + }, + { + "epoch": 0.365814696485623, + "grad_norm": 0.13816051185131073, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 458 + }, + { + "epoch": 0.36661341853035145, + "grad_norm": 0.09056458622217178, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 459 + }, + { + "epoch": 0.36741214057507987, + "grad_norm": 0.1648412048816681, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 460 + }, + { + "epoch": 0.3682108626198083, + "grad_norm": 0.24407249689102173, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 461 + }, + { + "epoch": 0.36900958466453676, + "grad_norm": 0.1896992176771164, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 462 + }, + { + "epoch": 0.3698083067092652, + "grad_norm": 0.07938385009765625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 463 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.10241381078958511, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 464 + }, + { + "epoch": 0.37140575079872207, + "grad_norm": 0.14765797555446625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 465 + }, + { + "epoch": 0.3722044728434505, + "grad_norm": 0.11189796775579453, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 466 + }, + { + "epoch": 0.3730031948881789, + "grad_norm": 0.05604114383459091, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 467 + }, + { + "epoch": 0.3738019169329074, + "grad_norm": 0.18633529543876648, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 468 + }, + { + "epoch": 0.3746006389776358, + "grad_norm": 0.2587120234966278, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 469 + }, + { + "epoch": 0.3753993610223642, + "grad_norm": 0.21629218757152557, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 470 + }, + { + "epoch": 0.3761980830670926, + "grad_norm": 0.11872006952762604, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 471 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 0.07732011377811432, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 472 + }, + { + "epoch": 0.3777955271565495, + "grad_norm": 0.20141537487506866, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 473 + }, + { + "epoch": 0.37859424920127793, + "grad_norm": 0.26726409792900085, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 474 + }, + { + "epoch": 0.3793929712460064, + "grad_norm": 0.2373354583978653, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 475 + }, + { + "epoch": 0.3801916932907348, + "grad_norm": 0.15030571818351746, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 476 + }, + { + "epoch": 0.38099041533546324, + "grad_norm": 0.05345006287097931, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 477 + }, + { + "epoch": 0.3817891373801917, + "grad_norm": 0.12551648914813995, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 478 + }, + { + "epoch": 0.38258785942492013, + "grad_norm": 0.14036186039447784, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 479 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.09807970374822617, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 480 + }, + { + "epoch": 0.384185303514377, + "grad_norm": 0.05071088671684265, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 481 + }, + { + "epoch": 0.38498402555910544, + "grad_norm": 0.07541649043560028, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 482 + }, + { + "epoch": 0.38578274760383385, + "grad_norm": 0.059762127697467804, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 483 + }, + { + "epoch": 0.3865814696485623, + "grad_norm": 0.05540496110916138, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 484 + }, + { + "epoch": 0.38738019169329074, + "grad_norm": 0.09137953072786331, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 485 + }, + { + "epoch": 0.38817891373801916, + "grad_norm": 0.1349237710237503, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 486 + }, + { + "epoch": 0.3889776357827476, + "grad_norm": 0.13889296352863312, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 487 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 0.16406965255737305, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 488 + }, + { + "epoch": 0.39057507987220447, + "grad_norm": 0.1748959869146347, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 489 + }, + { + "epoch": 0.3913738019169329, + "grad_norm": 0.1518068015575409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 490 + }, + { + "epoch": 0.39217252396166136, + "grad_norm": 0.06694433838129044, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 491 + }, + { + "epoch": 0.3929712460063898, + "grad_norm": 0.11556574702262878, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 492 + }, + { + "epoch": 0.3937699680511182, + "grad_norm": 0.2562897801399231, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 493 + }, + { + "epoch": 0.39456869009584666, + "grad_norm": 0.30842337012290955, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 494 + }, + { + "epoch": 0.3953674121405751, + "grad_norm": 0.30477815866470337, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 495 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.2602941691875458, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 496 + }, + { + "epoch": 0.39696485623003197, + "grad_norm": 0.1692838817834854, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 497 + }, + { + "epoch": 0.3977635782747604, + "grad_norm": 0.07468903064727783, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 498 + }, + { + "epoch": 0.3985623003194888, + "grad_norm": 0.05872616916894913, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 499 + }, + { + "epoch": 0.3993610223642173, + "grad_norm": 0.09878433495759964, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 500 + }, + { + "epoch": 0.4001597444089457, + "grad_norm": 0.13779069483280182, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 501 + }, + { + "epoch": 0.4009584664536741, + "grad_norm": 0.17778213322162628, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 502 + }, + { + "epoch": 0.40175718849840253, + "grad_norm": 0.15572750568389893, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 503 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 0.1154002770781517, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 504 + }, + { + "epoch": 0.4033546325878594, + "grad_norm": 0.04485362395644188, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 505 + }, + { + "epoch": 0.40415335463258784, + "grad_norm": 0.07514321058988571, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 506 + }, + { + "epoch": 0.4049520766773163, + "grad_norm": 0.13954220712184906, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 507 + }, + { + "epoch": 0.4057507987220447, + "grad_norm": 0.20726922154426575, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 508 + }, + { + "epoch": 0.40654952076677314, + "grad_norm": 0.28239160776138306, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 509 + }, + { + "epoch": 0.4073482428115016, + "grad_norm": 0.28484129905700684, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 510 + }, + { + "epoch": 0.40814696485623003, + "grad_norm": 0.28111377358436584, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 511 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.25087496638298035, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 512 + }, + { + "epoch": 0.4097444089456869, + "grad_norm": 0.1652008444070816, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 513 + }, + { + "epoch": 0.41054313099041534, + "grad_norm": 0.11345700174570084, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 514 + }, + { + "epoch": 0.41134185303514376, + "grad_norm": 0.1191159337759018, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 515 + }, + { + "epoch": 0.41214057507987223, + "grad_norm": 0.26302817463874817, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 516 + }, + { + "epoch": 0.41293929712460065, + "grad_norm": 0.3303217589855194, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 517 + }, + { + "epoch": 0.41373801916932906, + "grad_norm": 0.2874647378921509, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 518 + }, + { + "epoch": 0.4145367412140575, + "grad_norm": 0.23112182319164276, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 519 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 0.16285021603107452, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 520 + }, + { + "epoch": 0.41613418530351437, + "grad_norm": 0.08440099656581879, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 521 + }, + { + "epoch": 0.4169329073482428, + "grad_norm": 0.03578028455376625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 522 + }, + { + "epoch": 0.41773162939297126, + "grad_norm": 0.0995275005698204, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 523 + }, + { + "epoch": 0.4185303514376997, + "grad_norm": 0.17713160812854767, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 524 + }, + { + "epoch": 0.4193290734824281, + "grad_norm": 0.1685509830713272, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 525 + }, + { + "epoch": 0.42012779552715657, + "grad_norm": 0.11357919126749039, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 526 + }, + { + "epoch": 0.420926517571885, + "grad_norm": 0.059025365859270096, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 527 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.05128806456923485, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 528 + }, + { + "epoch": 0.4225239616613419, + "grad_norm": 0.05291247367858887, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 529 + }, + { + "epoch": 0.4233226837060703, + "grad_norm": 0.10755500197410583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 530 + }, + { + "epoch": 0.4241214057507987, + "grad_norm": 0.15659615397453308, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 531 + }, + { + "epoch": 0.4249201277955272, + "grad_norm": 0.19369953870773315, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 532 + }, + { + "epoch": 0.4257188498402556, + "grad_norm": 0.16491396725177765, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 533 + }, + { + "epoch": 0.426517571884984, + "grad_norm": 0.10276799649000168, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 534 + }, + { + "epoch": 0.4273162939297125, + "grad_norm": 0.06273368000984192, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 535 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 0.03896406292915344, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 536 + }, + { + "epoch": 0.4289137380191693, + "grad_norm": 0.08083273470401764, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 537 + }, + { + "epoch": 0.42971246006389774, + "grad_norm": 0.05107828602194786, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 538 + }, + { + "epoch": 0.4305111821086262, + "grad_norm": 0.04359392821788788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 539 + }, + { + "epoch": 0.43130990415335463, + "grad_norm": 0.04225402697920799, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 540 + }, + { + "epoch": 0.43210862619808305, + "grad_norm": 0.07523404061794281, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 541 + }, + { + "epoch": 0.4329073482428115, + "grad_norm": 0.07966417819261551, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 542 + }, + { + "epoch": 0.43370607028753994, + "grad_norm": 0.04529299959540367, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 543 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.0793156549334526, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 544 + }, + { + "epoch": 0.4353035143769968, + "grad_norm": 0.1533992737531662, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 545 + }, + { + "epoch": 0.43610223642172524, + "grad_norm": 0.2893797755241394, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 546 + }, + { + "epoch": 0.43690095846645366, + "grad_norm": 0.4145842492580414, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 547 + }, + { + "epoch": 0.43769968051118213, + "grad_norm": 0.4550987482070923, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 548 + }, + { + "epoch": 0.43849840255591055, + "grad_norm": 0.4318651556968689, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 549 + }, + { + "epoch": 0.43929712460063897, + "grad_norm": 0.35961681604385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 550 + }, + { + "epoch": 0.44009584664536744, + "grad_norm": 0.18606753647327423, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 551 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 0.12992478907108307, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 552 + }, + { + "epoch": 0.4416932907348243, + "grad_norm": 0.32936930656433105, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 553 + }, + { + "epoch": 0.4424920127795527, + "grad_norm": 0.3547491133213043, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 554 + }, + { + "epoch": 0.44329073482428116, + "grad_norm": 0.2144627720117569, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 555 + }, + { + "epoch": 0.4440894568690096, + "grad_norm": 0.07260395586490631, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 556 + }, + { + "epoch": 0.444888178913738, + "grad_norm": 0.19895662367343903, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 557 + }, + { + "epoch": 0.44568690095846647, + "grad_norm": 0.18664990365505219, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 558 + }, + { + "epoch": 0.4464856230031949, + "grad_norm": 0.11666610836982727, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 559 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.11163592338562012, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 560 + }, + { + "epoch": 0.4480830670926518, + "grad_norm": 0.1815878301858902, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 561 + }, + { + "epoch": 0.4488817891373802, + "grad_norm": 0.2593924105167389, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 562 + }, + { + "epoch": 0.4496805111821086, + "grad_norm": 0.20761220157146454, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 563 + }, + { + "epoch": 0.4504792332268371, + "grad_norm": 0.06589766591787338, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 564 + }, + { + "epoch": 0.4512779552715655, + "grad_norm": 0.21619920432567596, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 565 + }, + { + "epoch": 0.4520766773162939, + "grad_norm": 0.2392708659172058, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 566 + }, + { + "epoch": 0.4528753993610224, + "grad_norm": 0.23214633762836456, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 567 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 0.263883501291275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 568 + }, + { + "epoch": 0.4544728434504792, + "grad_norm": 0.19914190471172333, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 569 + }, + { + "epoch": 0.45527156549520764, + "grad_norm": 0.11453433334827423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 570 + }, + { + "epoch": 0.4560702875399361, + "grad_norm": 0.15091221034526825, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 571 + }, + { + "epoch": 0.45686900958466453, + "grad_norm": 0.043582383543252945, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 572 + }, + { + "epoch": 0.45766773162939295, + "grad_norm": 0.14068740606307983, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 573 + }, + { + "epoch": 0.4584664536741214, + "grad_norm": 0.1274290233850479, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 574 + }, + { + "epoch": 0.45926517571884984, + "grad_norm": 0.13504599034786224, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 575 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.1267779916524887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 576 + }, + { + "epoch": 0.46086261980830673, + "grad_norm": 0.08138085901737213, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 577 + }, + { + "epoch": 0.46166134185303515, + "grad_norm": 0.07772356271743774, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 578 + }, + { + "epoch": 0.46246006389776356, + "grad_norm": 0.06863631308078766, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 579 + }, + { + "epoch": 0.46325878594249204, + "grad_norm": 0.1232575923204422, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 580 + }, + { + "epoch": 0.46405750798722045, + "grad_norm": 0.179134801030159, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 581 + }, + { + "epoch": 0.46485623003194887, + "grad_norm": 0.20545582473278046, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 582 + }, + { + "epoch": 0.46565495207667734, + "grad_norm": 0.14182575047016144, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 583 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 0.05813328176736832, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 584 + }, + { + "epoch": 0.4672523961661342, + "grad_norm": 0.1530984789133072, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 585 + }, + { + "epoch": 0.4680511182108626, + "grad_norm": 0.2820036709308624, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 586 + }, + { + "epoch": 0.46884984025559107, + "grad_norm": 0.39252954721450806, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 587 + }, + { + "epoch": 0.4696485623003195, + "grad_norm": 0.40830549597740173, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 588 + }, + { + "epoch": 0.4704472843450479, + "grad_norm": 0.2846182882785797, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 589 + }, + { + "epoch": 0.4712460063897764, + "grad_norm": 0.06798163801431656, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 590 + }, + { + "epoch": 0.4720447284345048, + "grad_norm": 0.18650950491428375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 591 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.2965260446071625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 592 + }, + { + "epoch": 0.4736421725239617, + "grad_norm": 0.24504852294921875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 593 + }, + { + "epoch": 0.4744408945686901, + "grad_norm": 0.11336984485387802, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 594 + }, + { + "epoch": 0.4752396166134185, + "grad_norm": 0.09007567912340164, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 595 + }, + { + "epoch": 0.476038338658147, + "grad_norm": 0.225834459066391, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 596 + }, + { + "epoch": 0.4768370607028754, + "grad_norm": 0.2679842710494995, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 597 + }, + { + "epoch": 0.4776357827476038, + "grad_norm": 0.1801901012659073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 598 + }, + { + "epoch": 0.4784345047923323, + "grad_norm": 0.09554167836904526, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 599 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 0.046632468700408936, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 600 + }, + { + "epoch": 0.48003194888178913, + "grad_norm": 0.12078758329153061, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 601 + }, + { + "epoch": 0.48083067092651754, + "grad_norm": 0.12126865237951279, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 602 + }, + { + "epoch": 0.481629392971246, + "grad_norm": 0.14078640937805176, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 603 + }, + { + "epoch": 0.48242811501597443, + "grad_norm": 0.18556037545204163, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 604 + }, + { + "epoch": 0.48322683706070285, + "grad_norm": 0.178151473402977, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 605 + }, + { + "epoch": 0.4840255591054313, + "grad_norm": 0.1672516018152237, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 606 + }, + { + "epoch": 0.48482428115015974, + "grad_norm": 0.11648737639188766, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 607 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.11820051819086075, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 608 + }, + { + "epoch": 0.48642172523961663, + "grad_norm": 0.21110932528972626, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 609 + }, + { + "epoch": 0.48722044728434505, + "grad_norm": 0.24852754175662994, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 610 + }, + { + "epoch": 0.48801916932907347, + "grad_norm": 0.2633175551891327, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 611 + }, + { + "epoch": 0.48881789137380194, + "grad_norm": 0.21904303133487701, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 612 + }, + { + "epoch": 0.48961661341853036, + "grad_norm": 0.07822466641664505, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 613 + }, + { + "epoch": 0.4904153354632588, + "grad_norm": 0.0767827108502388, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 614 + }, + { + "epoch": 0.49121405750798725, + "grad_norm": 0.07943699508905411, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 615 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 0.055741772055625916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 616 + }, + { + "epoch": 0.4928115015974441, + "grad_norm": 0.10400068014860153, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 617 + }, + { + "epoch": 0.4936102236421725, + "grad_norm": 0.05080602690577507, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 618 + }, + { + "epoch": 0.49440894568690097, + "grad_norm": 0.07927533984184265, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 619 + }, + { + "epoch": 0.4952076677316294, + "grad_norm": 0.07919944822788239, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 620 + }, + { + "epoch": 0.4960063897763578, + "grad_norm": 0.11013699322938919, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 621 + }, + { + "epoch": 0.4968051118210863, + "grad_norm": 0.16232389211654663, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 622 + }, + { + "epoch": 0.4976038338658147, + "grad_norm": 0.17625346779823303, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 623 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.1681327521800995, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 624 + }, + { + "epoch": 0.4992012779552716, + "grad_norm": 0.1882159262895584, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 625 + }, + { + "epoch": 0.5, + "grad_norm": 0.21075129508972168, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 626 + }, + { + "epoch": 0.5007987220447284, + "grad_norm": 0.1464296281337738, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 627 + }, + { + "epoch": 0.5015974440894568, + "grad_norm": 0.11155212670564651, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 628 + }, + { + "epoch": 0.5023961661341853, + "grad_norm": 0.09794416278600693, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 629 + }, + { + "epoch": 0.5031948881789138, + "grad_norm": 0.12095183879137039, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 630 + }, + { + "epoch": 0.5039936102236422, + "grad_norm": 0.1933794617652893, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 631 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 0.32272887229919434, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 632 + }, + { + "epoch": 0.505591054313099, + "grad_norm": 0.2507671117782593, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 633 + }, + { + "epoch": 0.5063897763578274, + "grad_norm": 0.09540661424398422, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 634 + }, + { + "epoch": 0.5071884984025559, + "grad_norm": 0.07341819256544113, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 635 + }, + { + "epoch": 0.5079872204472844, + "grad_norm": 0.11610874533653259, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 636 + }, + { + "epoch": 0.5087859424920128, + "grad_norm": 0.1338607519865036, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 637 + }, + { + "epoch": 0.5095846645367412, + "grad_norm": 0.07892445474863052, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 638 + }, + { + "epoch": 0.5103833865814696, + "grad_norm": 0.053661834448575974, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 639 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.06852453202009201, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 640 + }, + { + "epoch": 0.5119808306709265, + "grad_norm": 0.045109208673238754, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 641 + }, + { + "epoch": 0.512779552715655, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 642 + }, + { + "epoch": 0.5135782747603834, + "grad_norm": 0.05903350189328194, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 643 + }, + { + "epoch": 0.5143769968051118, + "grad_norm": 0.07314767688512802, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 644 + }, + { + "epoch": 0.5151757188498403, + "grad_norm": 0.12484236806631088, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 645 + }, + { + "epoch": 0.5159744408945687, + "grad_norm": 0.15683352947235107, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 646 + }, + { + "epoch": 0.5167731629392971, + "grad_norm": 0.13519413769245148, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 647 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 0.10333485156297684, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 648 + }, + { + "epoch": 0.518370607028754, + "grad_norm": 0.09626923501491547, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 649 + }, + { + "epoch": 0.5191693290734825, + "grad_norm": 0.08177447319030762, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 650 + }, + { + "epoch": 0.5199680511182109, + "grad_norm": 0.04186684265732765, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 651 + }, + { + "epoch": 0.5207667731629393, + "grad_norm": 0.07705547660589218, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 652 + }, + { + "epoch": 0.5215654952076677, + "grad_norm": 0.05885700136423111, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 653 + }, + { + "epoch": 0.5223642172523961, + "grad_norm": 0.14140211045742035, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 654 + }, + { + "epoch": 0.5231629392971247, + "grad_norm": 0.18797138333320618, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 655 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.2301982045173645, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 656 + }, + { + "epoch": 0.5247603833865815, + "grad_norm": 0.2813114523887634, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 657 + }, + { + "epoch": 0.5255591054313099, + "grad_norm": 0.3205592930316925, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 658 + }, + { + "epoch": 0.5263578274760383, + "grad_norm": 0.3426150381565094, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 659 + }, + { + "epoch": 0.5271565495207667, + "grad_norm": 0.2636663615703583, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 660 + }, + { + "epoch": 0.5279552715654952, + "grad_norm": 0.14799079298973083, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 661 + }, + { + "epoch": 0.5287539936102237, + "grad_norm": 0.06354992836713791, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 662 + }, + { + "epoch": 0.5295527156549521, + "grad_norm": 0.239300936460495, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 663 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 0.33535388112068176, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 664 + }, + { + "epoch": 0.5311501597444089, + "grad_norm": 0.32471078634262085, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 665 + }, + { + "epoch": 0.5319488817891374, + "grad_norm": 0.2491266429424286, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 666 + }, + { + "epoch": 0.5327476038338658, + "grad_norm": 0.09841614216566086, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 667 + }, + { + "epoch": 0.5335463258785943, + "grad_norm": 0.1310579627752304, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 668 + }, + { + "epoch": 0.5343450479233227, + "grad_norm": 0.28287971019744873, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 669 + }, + { + "epoch": 0.5351437699680511, + "grad_norm": 0.3457719385623932, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 670 + }, + { + "epoch": 0.5359424920127795, + "grad_norm": 0.31690946221351624, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 671 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.19356760382652283, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 672 + }, + { + "epoch": 0.5375399361022364, + "grad_norm": 0.05940595269203186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 673 + }, + { + "epoch": 0.5383386581469649, + "grad_norm": 0.20772181451320648, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 674 + }, + { + "epoch": 0.5391373801916933, + "grad_norm": 0.3093980848789215, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 675 + }, + { + "epoch": 0.5399361022364217, + "grad_norm": 0.2632107734680176, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 676 + }, + { + "epoch": 0.5407348242811502, + "grad_norm": 0.12365782260894775, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 677 + }, + { + "epoch": 0.5415335463258786, + "grad_norm": 0.07215466350317001, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 678 + }, + { + "epoch": 0.542332268370607, + "grad_norm": 0.16745947301387787, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 679 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 0.14418186247348785, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 680 + }, + { + "epoch": 0.5439297124600639, + "grad_norm": 0.048094023019075394, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 681 + }, + { + "epoch": 0.5447284345047924, + "grad_norm": 0.10100048035383224, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 682 + }, + { + "epoch": 0.5455271565495208, + "grad_norm": 0.13719545304775238, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 683 + }, + { + "epoch": 0.5463258785942492, + "grad_norm": 0.16066808998584747, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 684 + }, + { + "epoch": 0.5471246006389776, + "grad_norm": 0.19201414287090302, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 685 + }, + { + "epoch": 0.547923322683706, + "grad_norm": 0.19783100485801697, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 686 + }, + { + "epoch": 0.5487220447284346, + "grad_norm": 0.1431797295808792, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 687 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.04368956387042999, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 688 + }, + { + "epoch": 0.5503194888178914, + "grad_norm": 0.12395253777503967, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 689 + }, + { + "epoch": 0.5511182108626198, + "grad_norm": 0.16278770565986633, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 690 + }, + { + "epoch": 0.5519169329073482, + "grad_norm": 0.15368889272212982, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 691 + }, + { + "epoch": 0.5527156549520766, + "grad_norm": 0.10195931792259216, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 692 + }, + { + "epoch": 0.5535143769968051, + "grad_norm": 0.03421236202120781, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 693 + }, + { + "epoch": 0.5543130990415336, + "grad_norm": 0.09549148380756378, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 694 + }, + { + "epoch": 0.555111821086262, + "grad_norm": 0.17825989425182343, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 695 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 0.25296247005462646, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 696 + }, + { + "epoch": 0.5567092651757188, + "grad_norm": 0.27566400170326233, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 697 + }, + { + "epoch": 0.5575079872204473, + "grad_norm": 0.22609780728816986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 698 + }, + { + "epoch": 0.5583067092651757, + "grad_norm": 0.10555832833051682, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 699 + }, + { + "epoch": 0.5591054313099042, + "grad_norm": 0.1309640258550644, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 700 + }, + { + "epoch": 0.5599041533546326, + "grad_norm": 0.3434476852416992, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 701 + }, + { + "epoch": 0.560702875399361, + "grad_norm": 0.4559882581233978, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 702 + }, + { + "epoch": 0.5615015974440895, + "grad_norm": 0.390683650970459, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 703 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.14178164303302765, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 704 + }, + { + "epoch": 0.5630990415335463, + "grad_norm": 0.19113974273204803, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 705 + }, + { + "epoch": 0.5638977635782748, + "grad_norm": 0.38376086950302124, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 706 + }, + { + "epoch": 0.5646964856230032, + "grad_norm": 0.3486707806587219, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 707 + }, + { + "epoch": 0.5654952076677316, + "grad_norm": 0.14712302386760712, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 708 + }, + { + "epoch": 0.5662939297124601, + "grad_norm": 0.11827494204044342, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 709 + }, + { + "epoch": 0.5670926517571885, + "grad_norm": 0.27573689818382263, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 710 + }, + { + "epoch": 0.5678913738019169, + "grad_norm": 0.2983379065990448, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 711 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 0.2019582986831665, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 712 + }, + { + "epoch": 0.5694888178913738, + "grad_norm": 0.04186725243926048, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 713 + }, + { + "epoch": 0.5702875399361023, + "grad_norm": 0.16714231669902802, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 714 + }, + { + "epoch": 0.5710862619808307, + "grad_norm": 0.24982011318206787, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 715 + }, + { + "epoch": 0.5718849840255591, + "grad_norm": 0.22021397948265076, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 716 + }, + { + "epoch": 0.5726837060702875, + "grad_norm": 0.09717470407485962, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 717 + }, + { + "epoch": 0.5734824281150159, + "grad_norm": 0.10214962065219879, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 718 + }, + { + "epoch": 0.5742811501597445, + "grad_norm": 0.15325960516929626, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 719 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.11207877099514008, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 720 + }, + { + "epoch": 0.5758785942492013, + "grad_norm": 0.05425047129392624, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 721 + }, + { + "epoch": 0.5766773162939297, + "grad_norm": 0.0703732892870903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 722 + }, + { + "epoch": 0.5774760383386581, + "grad_norm": 0.10577918589115143, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 723 + }, + { + "epoch": 0.5782747603833865, + "grad_norm": 0.13230514526367188, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 724 + }, + { + "epoch": 0.579073482428115, + "grad_norm": 0.1878778040409088, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 725 + }, + { + "epoch": 0.5798722044728435, + "grad_norm": 0.19956567883491516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 726 + }, + { + "epoch": 0.5806709265175719, + "grad_norm": 0.13732020556926727, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 727 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 0.09844338148832321, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 728 + }, + { + "epoch": 0.5822683706070287, + "grad_norm": 0.056577637791633606, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 729 + }, + { + "epoch": 0.5830670926517572, + "grad_norm": 0.0835585743188858, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 730 + }, + { + "epoch": 0.5838658146964856, + "grad_norm": 0.0910082757472992, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 731 + }, + { + "epoch": 0.5846645367412141, + "grad_norm": 0.0659257099032402, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 732 + }, + { + "epoch": 0.5854632587859425, + "grad_norm": 0.09342535585165024, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 733 + }, + { + "epoch": 0.5862619808306709, + "grad_norm": 0.0627603679895401, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 734 + }, + { + "epoch": 0.5870607028753994, + "grad_norm": 0.10535050183534622, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 735 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.13628117740154266, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 736 + }, + { + "epoch": 0.5886581469648562, + "grad_norm": 0.0715300589799881, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 737 + }, + { + "epoch": 0.5894568690095847, + "grad_norm": 0.10892884433269501, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 738 + }, + { + "epoch": 0.5902555910543131, + "grad_norm": 0.09805259853601456, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 739 + }, + { + "epoch": 0.5910543130990416, + "grad_norm": 0.14491751790046692, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 740 + }, + { + "epoch": 0.59185303514377, + "grad_norm": 0.15448585152626038, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 741 + }, + { + "epoch": 0.5926517571884984, + "grad_norm": 0.08218494802713394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 742 + }, + { + "epoch": 0.5934504792332268, + "grad_norm": 0.16311237215995789, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 743 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 0.10310494899749756, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 744 + }, + { + "epoch": 0.5950479233226837, + "grad_norm": 0.1511978805065155, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 745 + }, + { + "epoch": 0.5958466453674122, + "grad_norm": 0.20440778136253357, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 746 + }, + { + "epoch": 0.5966453674121406, + "grad_norm": 0.20918506383895874, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 747 + }, + { + "epoch": 0.597444089456869, + "grad_norm": 0.20070627331733704, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 748 + }, + { + "epoch": 0.5982428115015974, + "grad_norm": 0.1142180860042572, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 749 + }, + { + "epoch": 0.5990415335463258, + "grad_norm": 0.09418357163667679, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 750 + }, + { + "epoch": 0.5998402555910544, + "grad_norm": 0.24306562542915344, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 751 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.3208121955394745, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 752 + }, + { + "epoch": 0.6014376996805112, + "grad_norm": 0.3070276081562042, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 753 + }, + { + "epoch": 0.6022364217252396, + "grad_norm": 0.17130877077579498, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 754 + }, + { + "epoch": 0.603035143769968, + "grad_norm": 0.0733935534954071, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 755 + }, + { + "epoch": 0.6038338658146964, + "grad_norm": 0.25525134801864624, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 756 + }, + { + "epoch": 0.604632587859425, + "grad_norm": 0.39397957921028137, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 757 + }, + { + "epoch": 0.6054313099041534, + "grad_norm": 0.39015471935272217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 758 + }, + { + "epoch": 0.6062300319488818, + "grad_norm": 0.1757609099149704, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 759 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 0.19901637732982635, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 760 + }, + { + "epoch": 0.6078274760383386, + "grad_norm": 0.46885979175567627, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 761 + }, + { + "epoch": 0.6086261980830671, + "grad_norm": 0.4650067687034607, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 762 + }, + { + "epoch": 0.6094249201277955, + "grad_norm": 0.16624194383621216, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 763 + }, + { + "epoch": 0.610223642172524, + "grad_norm": 0.23347698152065277, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 764 + }, + { + "epoch": 0.6110223642172524, + "grad_norm": 0.40192991495132446, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 765 + }, + { + "epoch": 0.6118210862619808, + "grad_norm": 0.33640867471694946, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 766 + }, + { + "epoch": 0.6126198083067093, + "grad_norm": 0.11979667842388153, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 767 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.17994286119937897, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 768 + }, + { + "epoch": 0.6142172523961661, + "grad_norm": 0.2693847715854645, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 769 + }, + { + "epoch": 0.6150159744408946, + "grad_norm": 0.2041584849357605, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 770 + }, + { + "epoch": 0.615814696485623, + "grad_norm": 0.052040908485651016, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 771 + }, + { + "epoch": 0.6166134185303515, + "grad_norm": 0.18652868270874023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 772 + }, + { + "epoch": 0.6174121405750799, + "grad_norm": 0.26122182607650757, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 773 + }, + { + "epoch": 0.6182108626198083, + "grad_norm": 0.15385891497135162, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 774 + }, + { + "epoch": 0.6190095846645367, + "grad_norm": 0.09217085689306259, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 775 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 0.23316404223442078, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 776 + }, + { + "epoch": 0.6206070287539937, + "grad_norm": 0.24094274640083313, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 777 + }, + { + "epoch": 0.6214057507987221, + "grad_norm": 0.08518059551715851, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 778 + }, + { + "epoch": 0.6222044728434505, + "grad_norm": 0.11076594144105911, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 779 + }, + { + "epoch": 0.6230031948881789, + "grad_norm": 0.1963978409767151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 780 + }, + { + "epoch": 0.6238019169329073, + "grad_norm": 0.1526973396539688, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 781 + }, + { + "epoch": 0.6246006389776357, + "grad_norm": 0.09434971958398819, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 782 + }, + { + "epoch": 0.6253993610223643, + "grad_norm": 0.2677021622657776, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 783 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.2885434329509735, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 784 + }, + { + "epoch": 0.6269968051118211, + "grad_norm": 0.14111816883087158, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 785 + }, + { + "epoch": 0.6277955271565495, + "grad_norm": 0.06594719737768173, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 786 + }, + { + "epoch": 0.6285942492012779, + "grad_norm": 0.09837283194065094, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 787 + }, + { + "epoch": 0.6293929712460063, + "grad_norm": 0.06089933589100838, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 788 + }, + { + "epoch": 0.6301916932907349, + "grad_norm": 0.16248181462287903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 789 + }, + { + "epoch": 0.6309904153354633, + "grad_norm": 0.298454612493515, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 790 + }, + { + "epoch": 0.6317891373801917, + "grad_norm": 0.3365437090396881, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 791 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 0.22858452796936035, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 792 + }, + { + "epoch": 0.6333865814696485, + "grad_norm": 0.04849984869360924, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 793 + }, + { + "epoch": 0.634185303514377, + "grad_norm": 0.24791331589221954, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 794 + }, + { + "epoch": 0.6349840255591054, + "grad_norm": 0.3028055727481842, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 795 + }, + { + "epoch": 0.6357827476038339, + "grad_norm": 0.15674540400505066, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 796 + }, + { + "epoch": 0.6365814696485623, + "grad_norm": 0.08521793782711029, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 797 + }, + { + "epoch": 0.6373801916932907, + "grad_norm": 0.21750952303409576, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 798 + }, + { + "epoch": 0.6381789137380192, + "grad_norm": 0.18880338966846466, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 799 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.06699419766664505, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 800 + }, + { + "epoch": 0.639776357827476, + "grad_norm": 0.08062998205423355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 801 + }, + { + "epoch": 0.6405750798722045, + "grad_norm": 0.10635658353567123, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 802 + }, + { + "epoch": 0.6413738019169329, + "grad_norm": 0.05086763948202133, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 803 + }, + { + "epoch": 0.6421725239616614, + "grad_norm": 0.09852107614278793, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 804 + }, + { + "epoch": 0.6429712460063898, + "grad_norm": 0.11290771514177322, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 805 + }, + { + "epoch": 0.6437699680511182, + "grad_norm": 0.15106825530529022, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 806 + }, + { + "epoch": 0.6445686900958466, + "grad_norm": 0.13646326959133148, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 807 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 0.06398668140172958, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 808 + }, + { + "epoch": 0.6461661341853036, + "grad_norm": 0.11581127345561981, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 809 + }, + { + "epoch": 0.646964856230032, + "grad_norm": 0.15684139728546143, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 810 + }, + { + "epoch": 0.6477635782747604, + "grad_norm": 0.14094121754169464, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 811 + }, + { + "epoch": 0.6485623003194888, + "grad_norm": 0.0938766822218895, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 812 + }, + { + "epoch": 0.6493610223642172, + "grad_norm": 0.06041521951556206, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 813 + }, + { + "epoch": 0.6501597444089456, + "grad_norm": 0.13364291191101074, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 814 + }, + { + "epoch": 0.6509584664536742, + "grad_norm": 0.15577054023742676, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 815 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.1119854673743248, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 816 + }, + { + "epoch": 0.652555910543131, + "grad_norm": 0.07751357555389404, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 817 + }, + { + "epoch": 0.6533546325878594, + "grad_norm": 0.10110143572092056, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 818 + }, + { + "epoch": 0.6541533546325878, + "grad_norm": 0.19627511501312256, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 819 + }, + { + "epoch": 0.6549520766773163, + "grad_norm": 0.19837769865989685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 820 + }, + { + "epoch": 0.6557507987220448, + "grad_norm": 0.13598690927028656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 821 + }, + { + "epoch": 0.6565495207667732, + "grad_norm": 0.05950666591525078, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 822 + }, + { + "epoch": 0.6573482428115016, + "grad_norm": 0.060314662754535675, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 823 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 0.11455138027667999, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 824 + }, + { + "epoch": 0.6589456869009584, + "grad_norm": 0.16753345727920532, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 825 + }, + { + "epoch": 0.6597444089456869, + "grad_norm": 0.15707428753376007, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 826 + }, + { + "epoch": 0.6605431309904153, + "grad_norm": 0.07224153727293015, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 827 + }, + { + "epoch": 0.6613418530351438, + "grad_norm": 0.10538042336702347, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 828 + }, + { + "epoch": 0.6621405750798722, + "grad_norm": 0.18855130672454834, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 829 + }, + { + "epoch": 0.6629392971246006, + "grad_norm": 0.17752179503440857, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 830 + }, + { + "epoch": 0.6637380191693291, + "grad_norm": 0.10109171271324158, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 831 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.15006190538406372, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 832 + }, + { + "epoch": 0.6653354632587859, + "grad_norm": 0.2701014578342438, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 833 + }, + { + "epoch": 0.6661341853035144, + "grad_norm": 0.2607312500476837, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 834 + }, + { + "epoch": 0.6669329073482428, + "grad_norm": 0.19712841510772705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 835 + }, + { + "epoch": 0.6677316293929713, + "grad_norm": 0.0839366614818573, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 836 + }, + { + "epoch": 0.6685303514376997, + "grad_norm": 0.1595088541507721, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 837 + }, + { + "epoch": 0.6693290734824281, + "grad_norm": 0.2773466408252716, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 838 + }, + { + "epoch": 0.6701277955271565, + "grad_norm": 0.24616314470767975, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 839 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 0.15596427023410797, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 840 + }, + { + "epoch": 0.6717252396166135, + "grad_norm": 0.047822993248701096, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 841 + }, + { + "epoch": 0.6725239616613419, + "grad_norm": 0.17692670226097107, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 842 + }, + { + "epoch": 0.6733226837060703, + "grad_norm": 0.1742856502532959, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 843 + }, + { + "epoch": 0.6741214057507987, + "grad_norm": 0.15347127616405487, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 844 + }, + { + "epoch": 0.6749201277955271, + "grad_norm": 0.18238374590873718, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 845 + }, + { + "epoch": 0.6757188498402555, + "grad_norm": 0.1524323672056198, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 846 + }, + { + "epoch": 0.6765175718849841, + "grad_norm": 0.1820068210363388, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 847 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.2010941058397293, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 848 + }, + { + "epoch": 0.6781150159744409, + "grad_norm": 0.16428111493587494, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 849 + }, + { + "epoch": 0.6789137380191693, + "grad_norm": 0.1538572460412979, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 850 + }, + { + "epoch": 0.6797124600638977, + "grad_norm": 0.057427916675806046, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 851 + }, + { + "epoch": 0.6805111821086262, + "grad_norm": 0.08329081535339355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 852 + }, + { + "epoch": 0.6813099041533547, + "grad_norm": 0.05685174837708473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 853 + }, + { + "epoch": 0.6821086261980831, + "grad_norm": 0.15277032554149628, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 854 + }, + { + "epoch": 0.6829073482428115, + "grad_norm": 0.24243640899658203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 855 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 0.28722453117370605, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 856 + }, + { + "epoch": 0.6845047923322684, + "grad_norm": 0.1997309774160385, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 857 + }, + { + "epoch": 0.6853035143769968, + "grad_norm": 0.061719026416540146, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 858 + }, + { + "epoch": 0.6861022364217252, + "grad_norm": 0.23425672948360443, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 859 + }, + { + "epoch": 0.6869009584664537, + "grad_norm": 0.350109726190567, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 860 + }, + { + "epoch": 0.6876996805111821, + "grad_norm": 0.34444838762283325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 861 + }, + { + "epoch": 0.6884984025559105, + "grad_norm": 0.15325413644313812, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 862 + }, + { + "epoch": 0.689297124600639, + "grad_norm": 0.1227702870965004, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 863 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 0.24337291717529297, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 864 + }, + { + "epoch": 0.6908945686900958, + "grad_norm": 0.24047589302062988, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 865 + }, + { + "epoch": 0.6916932907348243, + "grad_norm": 0.13576050102710724, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 866 + }, + { + "epoch": 0.6924920127795527, + "grad_norm": 0.0503714494407177, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 867 + }, + { + "epoch": 0.6932907348242812, + "grad_norm": 0.1292860060930252, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 868 + }, + { + "epoch": 0.6940894568690096, + "grad_norm": 0.14698486030101776, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 869 + }, + { + "epoch": 0.694888178913738, + "grad_norm": 0.07720573991537094, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 870 + }, + { + "epoch": 0.6956869009584664, + "grad_norm": 0.1604471504688263, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 871 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 0.32734861969947815, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 872 + }, + { + "epoch": 0.6972843450479234, + "grad_norm": 0.32366684079170227, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 873 + }, + { + "epoch": 0.6980830670926518, + "grad_norm": 0.18428802490234375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 874 + }, + { + "epoch": 0.6988817891373802, + "grad_norm": 0.07498858869075775, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 875 + }, + { + "epoch": 0.6996805111821086, + "grad_norm": 0.24449816346168518, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 876 + }, + { + "epoch": 0.700479233226837, + "grad_norm": 0.26649829745292664, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 877 + }, + { + "epoch": 0.7012779552715654, + "grad_norm": 0.1315024197101593, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 878 + }, + { + "epoch": 0.702076677316294, + "grad_norm": 0.10907325148582458, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 879 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.2364589273929596, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 880 + }, + { + "epoch": 0.7036741214057508, + "grad_norm": 0.1663885861635208, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 881 + }, + { + "epoch": 0.7044728434504792, + "grad_norm": 0.0596470907330513, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 882 + }, + { + "epoch": 0.7052715654952076, + "grad_norm": 0.1519233137369156, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 883 + }, + { + "epoch": 0.7060702875399361, + "grad_norm": 0.23089520633220673, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 884 + }, + { + "epoch": 0.7068690095846646, + "grad_norm": 0.20667214691638947, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 885 + }, + { + "epoch": 0.707667731629393, + "grad_norm": 0.10739922523498535, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 886 + }, + { + "epoch": 0.7084664536741214, + "grad_norm": 0.04334057494997978, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 887 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 0.15619881451129913, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 888 + }, + { + "epoch": 0.7100638977635783, + "grad_norm": 0.26618269085884094, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 889 + }, + { + "epoch": 0.7108626198083067, + "grad_norm": 0.1834406554698944, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 890 + }, + { + "epoch": 0.7116613418530351, + "grad_norm": 0.08332087099552155, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 891 + }, + { + "epoch": 0.7124600638977636, + "grad_norm": 0.23721523582935333, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 892 + }, + { + "epoch": 0.713258785942492, + "grad_norm": 0.2912815809249878, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 893 + }, + { + "epoch": 0.7140575079872205, + "grad_norm": 0.25534820556640625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 894 + }, + { + "epoch": 0.7148562300319489, + "grad_norm": 0.14200575649738312, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 895 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.08668249845504761, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 896 + }, + { + "epoch": 0.7164536741214057, + "grad_norm": 0.2358543574810028, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 897 + }, + { + "epoch": 0.7172523961661342, + "grad_norm": 0.2729748487472534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 898 + }, + { + "epoch": 0.7180511182108626, + "grad_norm": 0.14862589538097382, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 899 + }, + { + "epoch": 0.7188498402555911, + "grad_norm": 0.14500044286251068, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 900 + }, + { + "epoch": 0.7196485623003195, + "grad_norm": 0.28659892082214355, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 901 + }, + { + "epoch": 0.7204472843450479, + "grad_norm": 0.2974075376987457, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 902 + }, + { + "epoch": 0.7212460063897763, + "grad_norm": 0.07839605212211609, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 903 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 0.2542141079902649, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 904 + }, + { + "epoch": 0.7228434504792333, + "grad_norm": 0.357192724943161, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 905 + }, + { + "epoch": 0.7236421725239617, + "grad_norm": 0.21535371243953705, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 906 + }, + { + "epoch": 0.7244408945686901, + "grad_norm": 0.08053386211395264, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 907 + }, + { + "epoch": 0.7252396166134185, + "grad_norm": 0.22670729458332062, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 908 + }, + { + "epoch": 0.7260383386581469, + "grad_norm": 0.21510791778564453, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 909 + }, + { + "epoch": 0.7268370607028753, + "grad_norm": 0.07556216418743134, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 910 + }, + { + "epoch": 0.7276357827476039, + "grad_norm": 0.08772645890712738, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 911 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.2531013488769531, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 912 + }, + { + "epoch": 0.7292332268370607, + "grad_norm": 0.06658858805894852, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 913 + }, + { + "epoch": 0.7300319488817891, + "grad_norm": 0.09869293123483658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 914 + }, + { + "epoch": 0.7308306709265175, + "grad_norm": 0.17758162319660187, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 915 + }, + { + "epoch": 0.731629392971246, + "grad_norm": 0.16267521679401398, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 916 + }, + { + "epoch": 0.7324281150159745, + "grad_norm": 0.09948690980672836, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 917 + }, + { + "epoch": 0.7332268370607029, + "grad_norm": 0.05900302529335022, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 918 + }, + { + "epoch": 0.7340255591054313, + "grad_norm": 0.08200150728225708, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 919 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 0.09217624366283417, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 920 + }, + { + "epoch": 0.7356230031948882, + "grad_norm": 0.12414196133613586, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 921 + }, + { + "epoch": 0.7364217252396166, + "grad_norm": 0.131890669465065, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 922 + }, + { + "epoch": 0.737220447284345, + "grad_norm": 0.1187182292342186, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 923 + }, + { + "epoch": 0.7380191693290735, + "grad_norm": 0.09890205413103104, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 924 + }, + { + "epoch": 0.7388178913738019, + "grad_norm": 0.06730851531028748, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 925 + }, + { + "epoch": 0.7396166134185304, + "grad_norm": 0.038627006113529205, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 926 + }, + { + "epoch": 0.7404153354632588, + "grad_norm": 0.07148899137973785, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 927 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 0.05876476690173149, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 928 + }, + { + "epoch": 0.7420127795527156, + "grad_norm": 0.11069595813751221, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 929 + }, + { + "epoch": 0.7428115015974441, + "grad_norm": 0.10409362614154816, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 930 + }, + { + "epoch": 0.7436102236421726, + "grad_norm": 0.08115468919277191, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 931 + }, + { + "epoch": 0.744408945686901, + "grad_norm": 0.14105193316936493, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 932 + }, + { + "epoch": 0.7452076677316294, + "grad_norm": 0.07780246436595917, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 933 + }, + { + "epoch": 0.7460063897763578, + "grad_norm": 0.08895678073167801, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 934 + }, + { + "epoch": 0.7468051118210862, + "grad_norm": 0.10844068974256516, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 935 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 0.07179753482341766, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 936 + }, + { + "epoch": 0.7484025559105432, + "grad_norm": 0.11107192933559418, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 937 + }, + { + "epoch": 0.7492012779552716, + "grad_norm": 0.2845052480697632, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 938 + }, + { + "epoch": 0.75, + "grad_norm": 0.41480058431625366, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 939 + }, + { + "epoch": 0.7507987220447284, + "grad_norm": 0.3101426064968109, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 940 + }, + { + "epoch": 0.7515974440894568, + "grad_norm": 0.09521801024675369, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 941 + }, + { + "epoch": 0.7523961661341853, + "grad_norm": 0.18613341450691223, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 942 + }, + { + "epoch": 0.7531948881789138, + "grad_norm": 0.2665672302246094, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 943 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.20693817734718323, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 944 + }, + { + "epoch": 0.7547923322683706, + "grad_norm": 0.05853262171149254, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 945 + }, + { + "epoch": 0.755591054313099, + "grad_norm": 0.22123664617538452, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 946 + }, + { + "epoch": 0.7563897763578274, + "grad_norm": 0.2845379114151001, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 947 + }, + { + "epoch": 0.7571884984025559, + "grad_norm": 0.20357397198677063, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 948 + }, + { + "epoch": 0.7579872204472844, + "grad_norm": 0.0897352546453476, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 949 + }, + { + "epoch": 0.7587859424920128, + "grad_norm": 0.06572771817445755, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 950 + }, + { + "epoch": 0.7595846645367412, + "grad_norm": 0.09441806375980377, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 951 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 0.06848953664302826, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 952 + }, + { + "epoch": 0.7611821086261981, + "grad_norm": 0.127177432179451, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 953 + }, + { + "epoch": 0.7619808306709265, + "grad_norm": 0.25466713309288025, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 954 + }, + { + "epoch": 0.762779552715655, + "grad_norm": 0.32952556014060974, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 955 + }, + { + "epoch": 0.7635782747603834, + "grad_norm": 0.2976897358894348, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 956 + }, + { + "epoch": 0.7643769968051118, + "grad_norm": 0.17444387078285217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 957 + }, + { + "epoch": 0.7651757188498403, + "grad_norm": 0.10458981990814209, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 958 + }, + { + "epoch": 0.7659744408945687, + "grad_norm": 0.07028939574956894, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 959 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.1888386309146881, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 960 + }, + { + "epoch": 0.7675718849840255, + "grad_norm": 0.19400012493133545, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 961 + }, + { + "epoch": 0.768370607028754, + "grad_norm": 0.12069790065288544, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 962 + }, + { + "epoch": 0.7691693290734825, + "grad_norm": 0.06206851452589035, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 963 + }, + { + "epoch": 0.7699680511182109, + "grad_norm": 0.07195326685905457, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 964 + }, + { + "epoch": 0.7707667731629393, + "grad_norm": 0.09240477532148361, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 965 + }, + { + "epoch": 0.7715654952076677, + "grad_norm": 0.04433378204703331, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 966 + }, + { + "epoch": 0.7723642172523961, + "grad_norm": 0.07411819696426392, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 967 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 0.11440210789442062, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 968 + }, + { + "epoch": 0.7739616613418531, + "grad_norm": 0.23913118243217468, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 969 + }, + { + "epoch": 0.7747603833865815, + "grad_norm": 0.31028202176094055, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 970 + }, + { + "epoch": 0.7755591054313099, + "grad_norm": 0.3343825936317444, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 971 + }, + { + "epoch": 0.7763578274760383, + "grad_norm": 0.2559935748577118, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 972 + }, + { + "epoch": 0.7771565495207667, + "grad_norm": 0.05685359239578247, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 973 + }, + { + "epoch": 0.7779552715654952, + "grad_norm": 0.1760183721780777, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 974 + }, + { + "epoch": 0.7787539936102237, + "grad_norm": 0.25240832567214966, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 975 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.13724291324615479, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 976 + }, + { + "epoch": 0.7803514376996805, + "grad_norm": 0.11687567830085754, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 977 + }, + { + "epoch": 0.7811501597444089, + "grad_norm": 0.31319329142570496, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 978 + }, + { + "epoch": 0.7819488817891374, + "grad_norm": 0.3297184705734253, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 979 + }, + { + "epoch": 0.7827476038338658, + "grad_norm": 0.19443389773368835, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 980 + }, + { + "epoch": 0.7835463258785943, + "grad_norm": 0.04911043494939804, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 981 + }, + { + "epoch": 0.7843450479233227, + "grad_norm": 0.19837717711925507, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 982 + }, + { + "epoch": 0.7851437699680511, + "grad_norm": 0.23165349662303925, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 983 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 0.12156365066766739, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 984 + }, + { + "epoch": 0.786741214057508, + "grad_norm": 0.1305016428232193, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 985 + }, + { + "epoch": 0.7875399361022364, + "grad_norm": 0.12228422611951828, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 986 + }, + { + "epoch": 0.7883386581469649, + "grad_norm": 0.09014695137739182, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 987 + }, + { + "epoch": 0.7891373801916933, + "grad_norm": 0.060052234679460526, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 988 + }, + { + "epoch": 0.7899361022364217, + "grad_norm": 0.17842933535575867, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 989 + }, + { + "epoch": 0.7907348242811502, + "grad_norm": 0.2823020815849304, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 990 + }, + { + "epoch": 0.7915335463258786, + "grad_norm": 0.2571483254432678, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 991 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.11443623155355453, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 992 + }, + { + "epoch": 0.7931309904153354, + "grad_norm": 0.09048285335302353, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 993 + }, + { + "epoch": 0.7939297124600639, + "grad_norm": 0.1863749772310257, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 994 + }, + { + "epoch": 0.7947284345047924, + "grad_norm": 0.1481461524963379, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 995 + }, + { + "epoch": 0.7955271565495208, + "grad_norm": 0.06870540231466293, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 996 + }, + { + "epoch": 0.7963258785942492, + "grad_norm": 0.04223543405532837, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 997 + }, + { + "epoch": 0.7971246006389776, + "grad_norm": 0.04194851219654083, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 998 + }, + { + "epoch": 0.797923322683706, + "grad_norm": 0.03982497751712799, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 999 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 0.20985758304595947, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1000 + }, + { + "epoch": 0.799520766773163, + "grad_norm": 0.11346526443958282, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1001 + }, + { + "epoch": 0.8003194888178914, + "grad_norm": 0.16594401001930237, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1002 + }, + { + "epoch": 0.8011182108626198, + "grad_norm": 0.1788545846939087, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1003 + }, + { + "epoch": 0.8019169329073482, + "grad_norm": 0.07928512245416641, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1004 + }, + { + "epoch": 0.8027156549520766, + "grad_norm": 0.0953991562128067, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1005 + }, + { + "epoch": 0.8035143769968051, + "grad_norm": 0.2052081823348999, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1006 + }, + { + "epoch": 0.8043130990415336, + "grad_norm": 0.1999465525150299, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1007 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.09821965545415878, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1008 + }, + { + "epoch": 0.8059105431309904, + "grad_norm": 0.0762021467089653, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1009 + }, + { + "epoch": 0.8067092651757188, + "grad_norm": 0.20475991070270538, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 0.8075079872204473, + "grad_norm": 0.23028631508350372, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1011 + }, + { + "epoch": 0.8083067092651757, + "grad_norm": 0.12122747302055359, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.8091054313099042, + "grad_norm": 0.08124672621488571, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1013 + }, + { + "epoch": 0.8099041533546326, + "grad_norm": 0.21313415467739105, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1014 + }, + { + "epoch": 0.810702875399361, + "grad_norm": 0.311813622713089, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1015 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 0.3032541275024414, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1016 + }, + { + "epoch": 0.8123003194888179, + "grad_norm": 0.21727560460567474, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1017 + }, + { + "epoch": 0.8130990415335463, + "grad_norm": 0.0620480477809906, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1018 + }, + { + "epoch": 0.8138977635782748, + "grad_norm": 0.20105740427970886, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1019 + }, + { + "epoch": 0.8146964856230032, + "grad_norm": 0.28996244072914124, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1020 + }, + { + "epoch": 0.8154952076677316, + "grad_norm": 0.22115157544612885, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1021 + }, + { + "epoch": 0.8162939297124601, + "grad_norm": 0.10071029514074326, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1022 + }, + { + "epoch": 0.8170926517571885, + "grad_norm": 0.12363877147436142, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1023 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.29970163106918335, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1024 + }, + { + "epoch": 0.8186900958466453, + "grad_norm": 0.32754749059677124, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1025 + }, + { + "epoch": 0.8194888178913738, + "grad_norm": 0.20028825104236603, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1026 + }, + { + "epoch": 0.8202875399361023, + "grad_norm": 0.08162792772054672, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1027 + }, + { + "epoch": 0.8210862619808307, + "grad_norm": 0.27463749051094055, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1028 + }, + { + "epoch": 0.8218849840255591, + "grad_norm": 0.30335354804992676, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1029 + }, + { + "epoch": 0.8226837060702875, + "grad_norm": 0.12106633186340332, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1030 + }, + { + "epoch": 0.8234824281150159, + "grad_norm": 0.16331955790519714, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1031 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 0.2764187455177307, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1032 + }, + { + "epoch": 0.8250798722044729, + "grad_norm": 0.20136456191539764, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1033 + }, + { + "epoch": 0.8258785942492013, + "grad_norm": 0.06438590586185455, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1034 + }, + { + "epoch": 0.8266773162939297, + "grad_norm": 0.18764367699623108, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1035 + }, + { + "epoch": 0.8274760383386581, + "grad_norm": 0.20327645540237427, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1036 + }, + { + "epoch": 0.8282747603833865, + "grad_norm": 0.08825036138296127, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1037 + }, + { + "epoch": 0.829073482428115, + "grad_norm": 0.11037785559892654, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1038 + }, + { + "epoch": 0.8298722044728435, + "grad_norm": 0.18273280560970306, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1039 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.16820372641086578, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1040 + }, + { + "epoch": 0.8314696485623003, + "grad_norm": 0.06250625103712082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1041 + }, + { + "epoch": 0.8322683706070287, + "grad_norm": 0.12141115218400955, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1042 + }, + { + "epoch": 0.8330670926517572, + "grad_norm": 0.13594450056552887, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1043 + }, + { + "epoch": 0.8338658146964856, + "grad_norm": 0.16069599986076355, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1044 + }, + { + "epoch": 0.8346645367412141, + "grad_norm": 0.11631255596876144, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1045 + }, + { + "epoch": 0.8354632587859425, + "grad_norm": 0.050075192004442215, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1046 + }, + { + "epoch": 0.8362619808306709, + "grad_norm": 0.06317511945962906, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1047 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 0.09078527241945267, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1048 + }, + { + "epoch": 0.8378594249201278, + "grad_norm": 0.1618194878101349, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1049 + }, + { + "epoch": 0.8386581469648562, + "grad_norm": 0.2044777274131775, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1050 + }, + { + "epoch": 0.8394568690095847, + "grad_norm": 0.20439067482948303, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.8402555910543131, + "grad_norm": 0.1967901587486267, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1052 + }, + { + "epoch": 0.8410543130990416, + "grad_norm": 0.06829354166984558, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1053 + }, + { + "epoch": 0.84185303514377, + "grad_norm": 0.12168806046247482, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1054 + }, + { + "epoch": 0.8426517571884984, + "grad_norm": 0.23461978137493134, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1055 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.28916484117507935, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1056 + }, + { + "epoch": 0.8442492012779552, + "grad_norm": 0.21827733516693115, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1057 + }, + { + "epoch": 0.8450479233226837, + "grad_norm": 0.045396093279123306, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1058 + }, + { + "epoch": 0.8458466453674122, + "grad_norm": 0.2391543984413147, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1059 + }, + { + "epoch": 0.8466453674121406, + "grad_norm": 0.2916122078895569, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1060 + }, + { + "epoch": 0.847444089456869, + "grad_norm": 0.1589413434267044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1061 + }, + { + "epoch": 0.8482428115015974, + "grad_norm": 0.14869733154773712, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1062 + }, + { + "epoch": 0.8490415335463258, + "grad_norm": 0.3719956874847412, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 1063 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1064 + }, + { + "epoch": 0.8506389776357828, + "grad_norm": 0.22647641599178314, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1065 + }, + { + "epoch": 0.8514376996805112, + "grad_norm": 0.14329837262630463, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1066 + }, + { + "epoch": 0.8522364217252396, + "grad_norm": 0.2508337199687958, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1067 + }, + { + "epoch": 0.853035143769968, + "grad_norm": 0.16483807563781738, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1068 + }, + { + "epoch": 0.8538338658146964, + "grad_norm": 0.08231265842914581, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1069 + }, + { + "epoch": 0.854632587859425, + "grad_norm": 0.15707719326019287, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1070 + }, + { + "epoch": 0.8554313099041534, + "grad_norm": 0.1741408407688141, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1071 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.06281771510839462, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1072 + }, + { + "epoch": 0.8570287539936102, + "grad_norm": 0.10936494171619415, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1073 + }, + { + "epoch": 0.8578274760383386, + "grad_norm": 0.08680932223796844, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1074 + }, + { + "epoch": 0.8586261980830671, + "grad_norm": 0.05679824575781822, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1075 + }, + { + "epoch": 0.8594249201277955, + "grad_norm": 0.07635466009378433, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1076 + }, + { + "epoch": 0.860223642172524, + "grad_norm": 0.08391202241182327, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 1077 + }, + { + "epoch": 0.8610223642172524, + "grad_norm": 0.044910602271556854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1078 + }, + { + "epoch": 0.8618210862619808, + "grad_norm": 0.07833745330572128, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 1079 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 0.11653397232294083, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1080 + }, + { + "epoch": 0.8634185303514377, + "grad_norm": 0.09041672199964523, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1081 + }, + { + "epoch": 0.8642172523961661, + "grad_norm": 0.061735767871141434, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1082 + }, + { + "epoch": 0.8650159744408946, + "grad_norm": 0.042857520282268524, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1083 + }, + { + "epoch": 0.865814696485623, + "grad_norm": 0.040145136415958405, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1084 + }, + { + "epoch": 0.8666134185303515, + "grad_norm": 0.05785573646426201, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1085 + }, + { + "epoch": 0.8674121405750799, + "grad_norm": 0.13503877818584442, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1086 + }, + { + "epoch": 0.8682108626198083, + "grad_norm": 0.16243800520896912, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1087 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.13211014866828918, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1088 + }, + { + "epoch": 0.8698083067092651, + "grad_norm": 0.08136262744665146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1089 + }, + { + "epoch": 0.8706070287539937, + "grad_norm": 0.07881205528974533, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1090 + }, + { + "epoch": 0.8714057507987221, + "grad_norm": 0.1660437136888504, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1091 + }, + { + "epoch": 0.8722044728434505, + "grad_norm": 0.1955040693283081, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1092 + }, + { + "epoch": 0.8730031948881789, + "grad_norm": 0.18039803206920624, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1093 + }, + { + "epoch": 0.8738019169329073, + "grad_norm": 0.13832250237464905, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1094 + }, + { + "epoch": 0.8746006389776357, + "grad_norm": 0.06982281059026718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1095 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 0.06607141345739365, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1096 + }, + { + "epoch": 0.8761980830670927, + "grad_norm": 0.08685869723558426, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1097 + }, + { + "epoch": 0.8769968051118211, + "grad_norm": 0.09157849103212357, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1098 + }, + { + "epoch": 0.8777955271565495, + "grad_norm": 0.05980607122182846, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1099 + }, + { + "epoch": 0.8785942492012779, + "grad_norm": 0.05037426948547363, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1100 + }, + { + "epoch": 0.8793929712460063, + "grad_norm": 0.09998175501823425, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 1101 + }, + { + "epoch": 0.8801916932907349, + "grad_norm": 0.14255133271217346, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1102 + }, + { + "epoch": 0.8809904153354633, + "grad_norm": 0.1332579255104065, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1103 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.06453413516283035, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1104 + }, + { + "epoch": 0.8825878594249201, + "grad_norm": 0.07107783854007721, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1105 + }, + { + "epoch": 0.8833865814696485, + "grad_norm": 0.14025849103927612, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1106 + }, + { + "epoch": 0.884185303514377, + "grad_norm": 0.18791186809539795, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1107 + }, + { + "epoch": 0.8849840255591054, + "grad_norm": 0.228570356965065, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1108 + }, + { + "epoch": 0.8857827476038339, + "grad_norm": 0.21574346721172333, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1109 + }, + { + "epoch": 0.8865814696485623, + "grad_norm": 0.14833906292915344, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1110 + }, + { + "epoch": 0.8873801916932907, + "grad_norm": 0.04756765812635422, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1111 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 0.13023658096790314, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1112 + }, + { + "epoch": 0.8889776357827476, + "grad_norm": 0.21199558675289154, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1113 + }, + { + "epoch": 0.889776357827476, + "grad_norm": 0.19635719060897827, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1114 + }, + { + "epoch": 0.8905750798722045, + "grad_norm": 0.14753709733486176, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1115 + }, + { + "epoch": 0.8913738019169329, + "grad_norm": 0.06639572232961655, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1116 + }, + { + "epoch": 0.8921725239616614, + "grad_norm": 0.09707840532064438, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1117 + }, + { + "epoch": 0.8929712460063898, + "grad_norm": 0.20057998597621918, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 1118 + }, + { + "epoch": 0.8937699680511182, + "grad_norm": 0.232718825340271, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1119 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.16340196132659912, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1120 + }, + { + "epoch": 0.895367412140575, + "grad_norm": 0.04553915560245514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 1121 + }, + { + "epoch": 0.8961661341853036, + "grad_norm": 0.12561571598052979, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1122 + }, + { + "epoch": 0.896964856230032, + "grad_norm": 0.19254666566848755, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1123 + }, + { + "epoch": 0.8977635782747604, + "grad_norm": 0.12862572073936462, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1124 + }, + { + "epoch": 0.8985623003194888, + "grad_norm": 0.051237158477306366, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1125 + }, + { + "epoch": 0.8993610223642172, + "grad_norm": 0.18603810667991638, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1126 + }, + { + "epoch": 0.9001597444089456, + "grad_norm": 0.2498294860124588, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1127 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 0.18809954822063446, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1128 + }, + { + "epoch": 0.9017571884984026, + "grad_norm": 0.06116599217057228, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1129 + }, + { + "epoch": 0.902555910543131, + "grad_norm": 0.07710137963294983, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1130 + }, + { + "epoch": 0.9033546325878594, + "grad_norm": 0.11208303272724152, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1131 + }, + { + "epoch": 0.9041533546325878, + "grad_norm": 0.11864814907312393, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1132 + }, + { + "epoch": 0.9049520766773163, + "grad_norm": 0.1261119246482849, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1133 + }, + { + "epoch": 0.9057507987220448, + "grad_norm": 0.10841526836156845, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1134 + }, + { + "epoch": 0.9065495207667732, + "grad_norm": 0.04871276393532753, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1135 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.08953645080327988, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1136 + }, + { + "epoch": 0.90814696485623, + "grad_norm": 0.1590365469455719, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1137 + }, + { + "epoch": 0.9089456869009584, + "grad_norm": 0.155691459774971, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1138 + }, + { + "epoch": 0.9097444089456869, + "grad_norm": 0.09982484579086304, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1139 + }, + { + "epoch": 0.9105431309904153, + "grad_norm": 0.08257611095905304, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1140 + }, + { + "epoch": 0.9113418530351438, + "grad_norm": 0.1036139577627182, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1141 + }, + { + "epoch": 0.9121405750798722, + "grad_norm": 0.06543707102537155, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1142 + }, + { + "epoch": 0.9129392971246006, + "grad_norm": 0.05375903844833374, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1143 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 0.13674795627593994, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1144 + }, + { + "epoch": 0.9145367412140575, + "grad_norm": 0.21575352549552917, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 1145 + }, + { + "epoch": 0.9153354632587859, + "grad_norm": 0.22478559613227844, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1146 + }, + { + "epoch": 0.9161341853035144, + "grad_norm": 0.1854555904865265, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1147 + }, + { + "epoch": 0.9169329073482428, + "grad_norm": 0.08605340123176575, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1148 + }, + { + "epoch": 0.9177316293929713, + "grad_norm": 0.14082656800746918, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1149 + }, + { + "epoch": 0.9185303514376997, + "grad_norm": 0.3214903771877289, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1150 + }, + { + "epoch": 0.9193290734824281, + "grad_norm": 0.4360012412071228, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 1151 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.3582250773906708, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1152 + }, + { + "epoch": 0.920926517571885, + "grad_norm": 0.1142783984541893, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1153 + }, + { + "epoch": 0.9217252396166135, + "grad_norm": 0.2035343497991562, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1154 + }, + { + "epoch": 0.9225239616613419, + "grad_norm": 0.3506172299385071, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1155 + }, + { + "epoch": 0.9233226837060703, + "grad_norm": 0.2129906564950943, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1156 + }, + { + "epoch": 0.9241214057507987, + "grad_norm": 0.12158108502626419, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1157 + }, + { + "epoch": 0.9249201277955271, + "grad_norm": 0.3931717872619629, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1158 + }, + { + "epoch": 0.9257188498402555, + "grad_norm": 0.36336907744407654, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1159 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 0.06781382113695145, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1160 + }, + { + "epoch": 0.9273162939297125, + "grad_norm": 0.3335910141468048, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1161 + }, + { + "epoch": 0.9281150159744409, + "grad_norm": 0.5017055869102478, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 1162 + }, + { + "epoch": 0.9289137380191693, + "grad_norm": 0.3635455071926117, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1163 + }, + { + "epoch": 0.9297124600638977, + "grad_norm": 0.06748906522989273, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1164 + }, + { + "epoch": 0.9305111821086262, + "grad_norm": 0.3723882734775543, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1165 + }, + { + "epoch": 0.9313099041533547, + "grad_norm": 0.2976631820201874, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1166 + }, + { + "epoch": 0.9321086261980831, + "grad_norm": 0.06998804211616516, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1167 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.3307324945926666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1168 + }, + { + "epoch": 0.9337060702875399, + "grad_norm": 0.29726436734199524, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1169 + }, + { + "epoch": 0.9345047923322684, + "grad_norm": 0.048596691340208054, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1170 + }, + { + "epoch": 0.9353035143769968, + "grad_norm": 0.2840823233127594, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1171 + }, + { + "epoch": 0.9361022364217252, + "grad_norm": 0.31426292657852173, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1172 + }, + { + "epoch": 0.9369009584664537, + "grad_norm": 0.16073261201381683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1173 + }, + { + "epoch": 0.9376996805111821, + "grad_norm": 0.05725392326712608, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1174 + }, + { + "epoch": 0.9384984025559105, + "grad_norm": 0.1674586981534958, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1175 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 0.13738949596881866, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1176 + }, + { + "epoch": 0.9400958466453674, + "grad_norm": 0.05350235849618912, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1177 + }, + { + "epoch": 0.9408945686900958, + "grad_norm": 0.10518805682659149, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.9416932907348243, + "grad_norm": 0.11264974623918533, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1179 + }, + { + "epoch": 0.9424920127795527, + "grad_norm": 0.06757227331399918, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1180 + }, + { + "epoch": 0.9432907348242812, + "grad_norm": 0.07214303314685822, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1181 + }, + { + "epoch": 0.9440894568690096, + "grad_norm": 0.12705406546592712, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1182 + }, + { + "epoch": 0.944888178913738, + "grad_norm": 0.09937570244073868, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 1183 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 0.05628623813390732, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1184 + }, + { + "epoch": 0.9464856230031949, + "grad_norm": 0.05685505270957947, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1185 + }, + { + "epoch": 0.9472843450479234, + "grad_norm": 0.06150783598423004, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1186 + }, + { + "epoch": 0.9480830670926518, + "grad_norm": 0.04247362166643143, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1187 + }, + { + "epoch": 0.9488817891373802, + "grad_norm": 0.05664962902665138, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1188 + }, + { + "epoch": 0.9496805111821086, + "grad_norm": 0.07421324402093887, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1189 + }, + { + "epoch": 0.950479233226837, + "grad_norm": 0.043645020574331284, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1190 + }, + { + "epoch": 0.9512779552715654, + "grad_norm": 0.0692208856344223, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1191 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 0.13804891705513, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1192 + }, + { + "epoch": 0.9528753993610224, + "grad_norm": 0.14874884486198425, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1193 + }, + { + "epoch": 0.9536741214057508, + "grad_norm": 0.08449128270149231, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1194 + }, + { + "epoch": 0.9544728434504792, + "grad_norm": 0.035032968968153, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1195 + }, + { + "epoch": 0.9552715654952076, + "grad_norm": 0.10837965458631516, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1196 + }, + { + "epoch": 0.9560702875399361, + "grad_norm": 0.17972581088542938, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1197 + }, + { + "epoch": 0.9568690095846646, + "grad_norm": 0.17075787484645844, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1198 + }, + { + "epoch": 0.957667731629393, + "grad_norm": 0.08269231766462326, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1199 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.07269515842199326, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1200 + }, + { + "epoch": 0.9592651757188498, + "grad_norm": 0.15345947444438934, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1201 + }, + { + "epoch": 0.9600638977635783, + "grad_norm": 0.19025452435016632, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1202 + }, + { + "epoch": 0.9608626198083067, + "grad_norm": 0.1782686710357666, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1203 + }, + { + "epoch": 0.9616613418530351, + "grad_norm": 0.1296931356191635, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1204 + }, + { + "epoch": 0.9624600638977636, + "grad_norm": 0.036208219826221466, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1205 + }, + { + "epoch": 0.963258785942492, + "grad_norm": 0.14282052218914032, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1206 + }, + { + "epoch": 0.9640575079872205, + "grad_norm": 0.26539498567581177, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1207 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 0.28352224826812744, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1208 + }, + { + "epoch": 0.9656549520766773, + "grad_norm": 0.14476369321346283, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1209 + }, + { + "epoch": 0.9664536741214057, + "grad_norm": 0.06859725713729858, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1210 + }, + { + "epoch": 0.9672523961661342, + "grad_norm": 0.19093726575374603, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1211 + }, + { + "epoch": 0.9680511182108626, + "grad_norm": 0.1848185807466507, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1212 + }, + { + "epoch": 0.9688498402555911, + "grad_norm": 0.05829976871609688, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1213 + }, + { + "epoch": 0.9696485623003195, + "grad_norm": 0.10105405002832413, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1214 + }, + { + "epoch": 0.9704472843450479, + "grad_norm": 0.12762011587619781, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1215 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.08238376677036285, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1216 + }, + { + "epoch": 0.9720447284345048, + "grad_norm": 0.07039444148540497, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1217 + }, + { + "epoch": 0.9728434504792333, + "grad_norm": 0.1320599615573883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1218 + }, + { + "epoch": 0.9736421725239617, + "grad_norm": 0.07799404859542847, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1219 + }, + { + "epoch": 0.9744408945686901, + "grad_norm": 0.11601961404085159, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1220 + }, + { + "epoch": 0.9752396166134185, + "grad_norm": 0.26134374737739563, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1221 + }, + { + "epoch": 0.9760383386581469, + "grad_norm": 0.275513231754303, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1222 + }, + { + "epoch": 0.9768370607028753, + "grad_norm": 0.0711631178855896, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1223 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 0.1879139244556427, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1224 + }, + { + "epoch": 0.9784345047923323, + "grad_norm": 0.24822647869586945, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1225 + }, + { + "epoch": 0.9792332268370607, + "grad_norm": 0.1244853138923645, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1226 + }, + { + "epoch": 0.9800319488817891, + "grad_norm": 0.07694529742002487, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1227 + }, + { + "epoch": 0.9808306709265175, + "grad_norm": 0.1280626803636551, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1228 + }, + { + "epoch": 0.981629392971246, + "grad_norm": 0.09127703309059143, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1229 + }, + { + "epoch": 0.9824281150159745, + "grad_norm": 0.06747932732105255, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1230 + }, + { + "epoch": 0.9832268370607029, + "grad_norm": 0.08196533471345901, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1231 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.09074689447879791, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1232 + }, + { + "epoch": 0.9848242811501597, + "grad_norm": 0.06031282991170883, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1233 + }, + { + "epoch": 0.9856230031948882, + "grad_norm": 0.07138215005397797, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1234 + }, + { + "epoch": 0.9864217252396166, + "grad_norm": 0.11056806892156601, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1235 + }, + { + "epoch": 0.987220447284345, + "grad_norm": 0.09108638018369675, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1236 + }, + { + "epoch": 0.9880191693290735, + "grad_norm": 0.0515020377933979, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1237 + }, + { + "epoch": 0.9888178913738019, + "grad_norm": 0.08467873930931091, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1238 + }, + { + "epoch": 0.9896166134185304, + "grad_norm": 0.10424523055553436, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1239 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 0.11506868153810501, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1240 + }, + { + "epoch": 0.9912140575079872, + "grad_norm": 0.13226476311683655, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1241 + }, + { + "epoch": 0.9920127795527156, + "grad_norm": 0.13714630901813507, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1242 + }, + { + "epoch": 0.9928115015974441, + "grad_norm": 0.08985403180122375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1243 + }, + { + "epoch": 0.9936102236421726, + "grad_norm": 0.1107666939496994, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 1244 + }, + { + "epoch": 0.994408945686901, + "grad_norm": 0.130653515458107, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1245 + }, + { + "epoch": 0.9952076677316294, + "grad_norm": 0.10675778985023499, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1246 + }, + { + "epoch": 0.9960063897763578, + "grad_norm": 0.042045243084430695, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1247 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.07957674562931061, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1248 + }, + { + "epoch": 0.9976038338658147, + "grad_norm": 0.06926224380731583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1249 + }, + { + "epoch": 0.9984025559105432, + "grad_norm": 0.0849846750497818, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1250 + }, + { + "epoch": 0.9992012779552716, + "grad_norm": 0.12501482665538788, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1251 + }, + { + "epoch": 1.0, + "grad_norm": 0.1467234194278717, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1252 + }, + { + "epoch": 1.0007987220447285, + "grad_norm": 0.11206725984811783, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1253 + }, + { + "epoch": 1.0015974440894568, + "grad_norm": 0.05224297568202019, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1254 + }, + { + "epoch": 1.0023961661341854, + "grad_norm": 0.15176911652088165, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1255 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.22419261932373047, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1256 + }, + { + "epoch": 1.0039936102236422, + "grad_norm": 0.18444369733333588, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1257 + }, + { + "epoch": 1.0047923322683705, + "grad_norm": 0.06510337442159653, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1258 + }, + { + "epoch": 1.005591054313099, + "grad_norm": 0.16058789193630219, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1259 + }, + { + "epoch": 1.0063897763578276, + "grad_norm": 0.22726313769817352, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1260 + }, + { + "epoch": 1.0071884984025559, + "grad_norm": 0.21050630509853363, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1261 + }, + { + "epoch": 1.0079872204472844, + "grad_norm": 0.09227188676595688, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1262 + }, + { + "epoch": 1.0087859424920127, + "grad_norm": 0.11473584920167923, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 1263 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 0.12692919373512268, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1264 + }, + { + "epoch": 1.0103833865814698, + "grad_norm": 0.056371819227933884, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1265 + }, + { + "epoch": 1.011182108626198, + "grad_norm": 0.13166245818138123, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1266 + }, + { + "epoch": 1.0119808306709266, + "grad_norm": 0.2606523633003235, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1267 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 0.320832759141922, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1268 + }, + { + "epoch": 1.0135782747603834, + "grad_norm": 0.2074427455663681, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1269 + }, + { + "epoch": 1.0143769968051117, + "grad_norm": 0.05768958851695061, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1270 + }, + { + "epoch": 1.0151757188498403, + "grad_norm": 0.08107002079486847, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1271 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 0.12996292114257812, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1272 + }, + { + "epoch": 1.016773162939297, + "grad_norm": 0.1514650285243988, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1273 + }, + { + "epoch": 1.0175718849840256, + "grad_norm": 0.1007395088672638, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1274 + }, + { + "epoch": 1.018370607028754, + "grad_norm": 0.0831306204199791, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1275 + }, + { + "epoch": 1.0191693290734825, + "grad_norm": 0.09004336595535278, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1276 + }, + { + "epoch": 1.0199680511182108, + "grad_norm": 0.06632232666015625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1277 + }, + { + "epoch": 1.0207667731629393, + "grad_norm": 0.05073424428701401, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1278 + }, + { + "epoch": 1.0215654952076678, + "grad_norm": 0.06486333161592484, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 1279 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 0.1137472614645958, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1280 + }, + { + "epoch": 1.0231629392971247, + "grad_norm": 0.08062250912189484, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1281 + }, + { + "epoch": 1.023961661341853, + "grad_norm": 0.05046350136399269, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1282 + }, + { + "epoch": 1.0247603833865815, + "grad_norm": 0.06503880023956299, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1283 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 0.10730332881212234, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1284 + }, + { + "epoch": 1.0263578274760383, + "grad_norm": 0.12077611684799194, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1285 + }, + { + "epoch": 1.0271565495207668, + "grad_norm": 0.15061219036579132, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1286 + }, + { + "epoch": 1.0279552715654952, + "grad_norm": 0.15091058611869812, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1287 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 0.07299874722957611, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1288 + }, + { + "epoch": 1.029552715654952, + "grad_norm": 0.09598413854837418, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1289 + }, + { + "epoch": 1.0303514376996805, + "grad_norm": 0.21661055088043213, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 1290 + }, + { + "epoch": 1.031150159744409, + "grad_norm": 0.24777255952358246, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1291 + }, + { + "epoch": 1.0319488817891374, + "grad_norm": 0.17097236216068268, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1292 + }, + { + "epoch": 1.0327476038338659, + "grad_norm": 0.05266748368740082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1293 + }, + { + "epoch": 1.0335463258785942, + "grad_norm": 0.12484195083379745, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1294 + }, + { + "epoch": 1.0343450479233227, + "grad_norm": 0.1802505999803543, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1295 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 0.10778877139091492, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1296 + }, + { + "epoch": 1.0359424920127795, + "grad_norm": 0.046645063906908035, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1297 + }, + { + "epoch": 1.036741214057508, + "grad_norm": 0.11727745085954666, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1298 + }, + { + "epoch": 1.0375399361022364, + "grad_norm": 0.1356390118598938, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1299 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 0.08130940794944763, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1300 + }, + { + "epoch": 1.0391373801916932, + "grad_norm": 0.07274319976568222, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1301 + }, + { + "epoch": 1.0399361022364217, + "grad_norm": 0.20339541137218475, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1302 + }, + { + "epoch": 1.04073482428115, + "grad_norm": 0.27819424867630005, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 1303 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 0.25879770517349243, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1304 + }, + { + "epoch": 1.042332268370607, + "grad_norm": 0.12683863937854767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1305 + }, + { + "epoch": 1.0431309904153354, + "grad_norm": 0.13531504571437836, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1306 + }, + { + "epoch": 1.043929712460064, + "grad_norm": 0.3203699588775635, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1307 + }, + { + "epoch": 1.0447284345047922, + "grad_norm": 0.3073630630970001, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1308 + }, + { + "epoch": 1.0455271565495208, + "grad_norm": 0.13184015452861786, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1309 + }, + { + "epoch": 1.0463258785942493, + "grad_norm": 0.1311715543270111, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1310 + }, + { + "epoch": 1.0471246006389776, + "grad_norm": 0.24470581114292145, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1311 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 0.21901719272136688, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1312 + }, + { + "epoch": 1.0487220447284344, + "grad_norm": 0.08105460554361343, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1313 + }, + { + "epoch": 1.049520766773163, + "grad_norm": 0.14864705502986908, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1314 + }, + { + "epoch": 1.0503194888178913, + "grad_norm": 0.20006732642650604, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1315 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 0.06233162060379982, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1316 + }, + { + "epoch": 1.0519169329073483, + "grad_norm": 0.12691672146320343, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1317 + }, + { + "epoch": 1.0527156549520766, + "grad_norm": 0.18303292989730835, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1318 + }, + { + "epoch": 1.0535143769968052, + "grad_norm": 0.13289928436279297, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1319 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 0.03847618028521538, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1320 + }, + { + "epoch": 1.055111821086262, + "grad_norm": 0.1317387968301773, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1321 + }, + { + "epoch": 1.0559105431309903, + "grad_norm": 0.1663348227739334, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1322 + }, + { + "epoch": 1.0567092651757188, + "grad_norm": 0.0657038614153862, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1323 + }, + { + "epoch": 1.0575079872204474, + "grad_norm": 0.1484680026769638, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1324 + }, + { + "epoch": 1.0583067092651757, + "grad_norm": 0.299824595451355, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1325 + }, + { + "epoch": 1.0591054313099042, + "grad_norm": 0.3598216772079468, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1326 + }, + { + "epoch": 1.0599041533546325, + "grad_norm": 0.25792455673217773, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1327 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 0.04925544187426567, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1328 + }, + { + "epoch": 1.0615015974440896, + "grad_norm": 0.2568669319152832, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1329 + }, + { + "epoch": 1.0623003194888179, + "grad_norm": 0.2679016590118408, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1330 + }, + { + "epoch": 1.0630990415335464, + "grad_norm": 0.12100119888782501, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1331 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 0.17324721813201904, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1332 + }, + { + "epoch": 1.0646964856230032, + "grad_norm": 0.34452658891677856, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1333 + }, + { + "epoch": 1.0654952076677315, + "grad_norm": 0.24561382830142975, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1334 + }, + { + "epoch": 1.06629392971246, + "grad_norm": 0.06080634891986847, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1335 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 0.249319925904274, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1336 + }, + { + "epoch": 1.067891373801917, + "grad_norm": 0.2586004436016083, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1337 + }, + { + "epoch": 1.0686900958466454, + "grad_norm": 0.07297322154045105, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1338 + }, + { + "epoch": 1.0694888178913737, + "grad_norm": 0.20853886008262634, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1339 + }, + { + "epoch": 1.0702875399361023, + "grad_norm": 0.3214154541492462, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1340 + }, + { + "epoch": 1.0710862619808306, + "grad_norm": 0.16169136762619019, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1341 + }, + { + "epoch": 1.071884984025559, + "grad_norm": 0.18989364802837372, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1342 + }, + { + "epoch": 1.0726837060702876, + "grad_norm": 0.42826735973358154, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1343 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 0.35387369990348816, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1344 + }, + { + "epoch": 1.0742811501597445, + "grad_norm": 0.061617862433195114, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 1345 + }, + { + "epoch": 1.0750798722044728, + "grad_norm": 0.3348129987716675, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1346 + }, + { + "epoch": 1.0758785942492013, + "grad_norm": 0.3622291088104248, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1347 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 0.12743668258190155, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1348 + }, + { + "epoch": 1.0774760383386581, + "grad_norm": 0.2464202642440796, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1349 + }, + { + "epoch": 1.0782747603833867, + "grad_norm": 0.3873802423477173, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1350 + }, + { + "epoch": 1.079073482428115, + "grad_norm": 0.22619839012622833, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1351 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 0.09080081433057785, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 1352 + }, + { + "epoch": 1.0806709265175718, + "grad_norm": 0.31380224227905273, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1353 + }, + { + "epoch": 1.0814696485623003, + "grad_norm": 0.2782067060470581, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1354 + }, + { + "epoch": 1.0822683706070289, + "grad_norm": 0.04267412796616554, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1355 + }, + { + "epoch": 1.0830670926517572, + "grad_norm": 0.2687273919582367, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1356 + }, + { + "epoch": 1.0838658146964857, + "grad_norm": 0.3133341073989868, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1357 + }, + { + "epoch": 1.084664536741214, + "grad_norm": 0.11658725887537003, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1358 + }, + { + "epoch": 1.0854632587859425, + "grad_norm": 0.1339937299489975, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1359 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 0.15727631747722626, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1360 + }, + { + "epoch": 1.0870607028753994, + "grad_norm": 0.11759792268276215, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1361 + }, + { + "epoch": 1.0878594249201279, + "grad_norm": 0.11522746086120605, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1362 + }, + { + "epoch": 1.0886581469648562, + "grad_norm": 0.16571135818958282, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1363 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 0.09467484056949615, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1364 + }, + { + "epoch": 1.090255591054313, + "grad_norm": 0.07887586951255798, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1365 + }, + { + "epoch": 1.0910543130990416, + "grad_norm": 0.11297929286956787, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1366 + }, + { + "epoch": 1.09185303514377, + "grad_norm": 0.06402980536222458, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1367 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 0.11947043240070343, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1368 + }, + { + "epoch": 1.093450479233227, + "grad_norm": 0.06244207173585892, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1369 + }, + { + "epoch": 1.0942492012779552, + "grad_norm": 0.08165531605482101, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1370 + }, + { + "epoch": 1.0950479233226837, + "grad_norm": 0.03842553123831749, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 1371 + }, + { + "epoch": 1.095846645367412, + "grad_norm": 0.12175651639699936, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1372 + }, + { + "epoch": 1.0966453674121406, + "grad_norm": 0.1720212697982788, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1373 + }, + { + "epoch": 1.097444089456869, + "grad_norm": 0.15540143847465515, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1374 + }, + { + "epoch": 1.0982428115015974, + "grad_norm": 0.1056036502122879, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1375 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 0.06738443672657013, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1376 + }, + { + "epoch": 1.0998402555910542, + "grad_norm": 0.09600193798542023, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1377 + }, + { + "epoch": 1.1006389776357828, + "grad_norm": 0.11872005462646484, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1378 + }, + { + "epoch": 1.101437699680511, + "grad_norm": 0.04837389290332794, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1379 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 0.11245802789926529, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1380 + }, + { + "epoch": 1.1030351437699681, + "grad_norm": 0.1525758057832718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1381 + }, + { + "epoch": 1.1038338658146964, + "grad_norm": 0.07688060402870178, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1382 + }, + { + "epoch": 1.104632587859425, + "grad_norm": 0.05793362855911255, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1383 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 0.09737680107355118, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1384 + }, + { + "epoch": 1.1062300319488818, + "grad_norm": 0.15511851012706757, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1385 + }, + { + "epoch": 1.1070287539936103, + "grad_norm": 0.14931945502758026, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1386 + }, + { + "epoch": 1.1078274760383386, + "grad_norm": 0.1451406478881836, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1387 + }, + { + "epoch": 1.1086261980830672, + "grad_norm": 0.06013273820281029, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1388 + }, + { + "epoch": 1.1094249201277955, + "grad_norm": 0.08433987945318222, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1389 + }, + { + "epoch": 1.110223642172524, + "grad_norm": 0.12601709365844727, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1390 + }, + { + "epoch": 1.1110223642172523, + "grad_norm": 0.14611507952213287, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1391 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 0.10526898503303528, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1392 + }, + { + "epoch": 1.1126198083067094, + "grad_norm": 0.03592250496149063, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1393 + }, + { + "epoch": 1.1134185303514377, + "grad_norm": 0.07883994281291962, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1394 + }, + { + "epoch": 1.1142172523961662, + "grad_norm": 0.1351863145828247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1395 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 0.10423804074525833, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1396 + }, + { + "epoch": 1.115814696485623, + "grad_norm": 0.05230586603283882, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1397 + }, + { + "epoch": 1.1166134185303513, + "grad_norm": 0.03962033987045288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1398 + }, + { + "epoch": 1.1174121405750799, + "grad_norm": 0.08950864523649216, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1399 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.1326761394739151, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1400 + }, + { + "epoch": 1.1190095846645367, + "grad_norm": 0.1251986175775528, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1401 + }, + { + "epoch": 1.1198083067092652, + "grad_norm": 0.05831597000360489, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1402 + }, + { + "epoch": 1.1206070287539935, + "grad_norm": 0.11382800340652466, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1403 + }, + { + "epoch": 1.121405750798722, + "grad_norm": 0.16290108859539032, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1404 + }, + { + "epoch": 1.1222044728434506, + "grad_norm": 0.1721554696559906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1405 + }, + { + "epoch": 1.123003194888179, + "grad_norm": 0.09426763653755188, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1406 + }, + { + "epoch": 1.1238019169329074, + "grad_norm": 0.037366580218076706, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1407 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 0.07456237077713013, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1408 + }, + { + "epoch": 1.1253993610223643, + "grad_norm": 0.11701856553554535, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1409 + }, + { + "epoch": 1.1261980830670926, + "grad_norm": 0.13261918723583221, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1410 + }, + { + "epoch": 1.126996805111821, + "grad_norm": 0.09014345705509186, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1411 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 0.05398619920015335, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1412 + }, + { + "epoch": 1.128594249201278, + "grad_norm": 0.09375960379838943, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1413 + }, + { + "epoch": 1.1293929712460065, + "grad_norm": 0.09307628124952316, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1414 + }, + { + "epoch": 1.1301916932907348, + "grad_norm": 0.09488195180892944, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1415 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 0.08067089319229126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1416 + }, + { + "epoch": 1.1317891373801916, + "grad_norm": 0.043899055570364, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1417 + }, + { + "epoch": 1.1325878594249201, + "grad_norm": 0.05593986064195633, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1418 + }, + { + "epoch": 1.1333865814696487, + "grad_norm": 0.05736452341079712, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1419 + }, + { + "epoch": 1.134185303514377, + "grad_norm": 0.1092999204993248, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1420 + }, + { + "epoch": 1.1349840255591055, + "grad_norm": 0.18366938829421997, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1421 + }, + { + "epoch": 1.1357827476038338, + "grad_norm": 0.177176833152771, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1422 + }, + { + "epoch": 1.1365814696485623, + "grad_norm": 0.08829191327095032, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1423 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 0.07169382274150848, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1424 + }, + { + "epoch": 1.1381789137380192, + "grad_norm": 0.130388081073761, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1425 + }, + { + "epoch": 1.1389776357827477, + "grad_norm": 0.20726168155670166, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1426 + }, + { + "epoch": 1.139776357827476, + "grad_norm": 0.21683751046657562, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1427 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 0.131125345826149, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1428 + }, + { + "epoch": 1.1413738019169328, + "grad_norm": 0.04309925064444542, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1429 + }, + { + "epoch": 1.1421725239616614, + "grad_norm": 0.14427928626537323, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1430 + }, + { + "epoch": 1.1429712460063897, + "grad_norm": 0.1743481606245041, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1431 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 0.1037210002541542, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1432 + }, + { + "epoch": 1.1445686900958467, + "grad_norm": 0.11162228137254715, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1433 + }, + { + "epoch": 1.145367412140575, + "grad_norm": 0.25445371866226196, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1434 + }, + { + "epoch": 1.1461661341853036, + "grad_norm": 0.2771884799003601, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1435 + }, + { + "epoch": 1.1469648562300319, + "grad_norm": 0.10653509199619293, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1436 + }, + { + "epoch": 1.1477635782747604, + "grad_norm": 0.1745259016752243, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1437 + }, + { + "epoch": 1.148562300319489, + "grad_norm": 0.3151826560497284, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1438 + }, + { + "epoch": 1.1493610223642172, + "grad_norm": 0.23229722678661346, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1439 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 0.06131701543927193, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1440 + }, + { + "epoch": 1.150958466453674, + "grad_norm": 0.28753313422203064, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1441 + }, + { + "epoch": 1.1517571884984026, + "grad_norm": 0.3178791105747223, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1442 + }, + { + "epoch": 1.1525559105431311, + "grad_norm": 0.10008880496025085, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1443 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 0.2418096512556076, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1444 + }, + { + "epoch": 1.154153354632588, + "grad_norm": 0.34728583693504333, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1445 + }, + { + "epoch": 1.1549520766773163, + "grad_norm": 0.2172212153673172, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1446 + }, + { + "epoch": 1.1557507987220448, + "grad_norm": 0.04184277728199959, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1447 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 0.19960719347000122, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1448 + }, + { + "epoch": 1.1573482428115016, + "grad_norm": 0.19261692464351654, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1449 + }, + { + "epoch": 1.15814696485623, + "grad_norm": 0.08326124399900436, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1450 + }, + { + "epoch": 1.1589456869009584, + "grad_norm": 0.08552456647157669, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1451 + }, + { + "epoch": 1.159744408945687, + "grad_norm": 0.07903868705034256, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 1452 + }, + { + "epoch": 1.1605431309904153, + "grad_norm": 0.045095205307006836, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1453 + }, + { + "epoch": 1.1613418530351438, + "grad_norm": 0.08293266594409943, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1454 + }, + { + "epoch": 1.1621405750798721, + "grad_norm": 0.09431439638137817, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1455 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 0.04189104586839676, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1456 + }, + { + "epoch": 1.1637380191693292, + "grad_norm": 0.11492408066987991, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1457 + }, + { + "epoch": 1.1645367412140575, + "grad_norm": 0.16648449003696442, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1458 + }, + { + "epoch": 1.165335463258786, + "grad_norm": 0.1532576084136963, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1459 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 0.07438737154006958, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1460 + }, + { + "epoch": 1.1669329073482428, + "grad_norm": 0.0887872502207756, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 1461 + }, + { + "epoch": 1.1677316293929714, + "grad_norm": 0.17035096883773804, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1462 + }, + { + "epoch": 1.1685303514376997, + "grad_norm": 0.12702526152133942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1463 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 0.04788994789123535, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1464 + }, + { + "epoch": 1.1701277955271565, + "grad_norm": 0.15093912184238434, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1465 + }, + { + "epoch": 1.170926517571885, + "grad_norm": 0.1428089439868927, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1466 + }, + { + "epoch": 1.1717252396166133, + "grad_norm": 0.039421554654836655, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1467 + }, + { + "epoch": 1.1725239616613419, + "grad_norm": 0.09461840242147446, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1468 + }, + { + "epoch": 1.1733226837060702, + "grad_norm": 0.07272787392139435, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1469 + }, + { + "epoch": 1.1741214057507987, + "grad_norm": 0.10863790661096573, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1470 + }, + { + "epoch": 1.1749201277955272, + "grad_norm": 0.211805522441864, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1471 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 0.2124311476945877, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1472 + }, + { + "epoch": 1.176517571884984, + "grad_norm": 0.14013712108135223, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1473 + }, + { + "epoch": 1.1773162939297124, + "grad_norm": 0.10768178105354309, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1474 + }, + { + "epoch": 1.178115015974441, + "grad_norm": 0.07961699366569519, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1475 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 0.0772516280412674, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1476 + }, + { + "epoch": 1.1797124600638977, + "grad_norm": 0.11957084387540817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1477 + }, + { + "epoch": 1.1805111821086263, + "grad_norm": 0.1976107954978943, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1478 + }, + { + "epoch": 1.1813099041533546, + "grad_norm": 0.20915871858596802, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1479 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 0.10857495665550232, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1480 + }, + { + "epoch": 1.1829073482428114, + "grad_norm": 0.09961260855197906, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1481 + }, + { + "epoch": 1.18370607028754, + "grad_norm": 0.11908663064241409, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1482 + }, + { + "epoch": 1.1845047923322685, + "grad_norm": 0.0982719212770462, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1483 + }, + { + "epoch": 1.1853035143769968, + "grad_norm": 0.05869903787970543, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1484 + }, + { + "epoch": 1.1861022364217253, + "grad_norm": 0.14943145215511322, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1485 + }, + { + "epoch": 1.1869009584664536, + "grad_norm": 0.1761479526758194, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1486 + }, + { + "epoch": 1.1876996805111821, + "grad_norm": 0.1393168866634369, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1487 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 0.0473988801240921, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1488 + }, + { + "epoch": 1.189297124600639, + "grad_norm": 0.20789027214050293, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1489 + }, + { + "epoch": 1.1900958466453675, + "grad_norm": 0.29456260800361633, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1490 + }, + { + "epoch": 1.1908945686900958, + "grad_norm": 0.1875244528055191, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1491 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 0.052052468061447144, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1492 + }, + { + "epoch": 1.1924920127795526, + "grad_norm": 0.1376652717590332, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1493 + }, + { + "epoch": 1.1932907348242812, + "grad_norm": 0.1656588762998581, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1494 + }, + { + "epoch": 1.1940894568690097, + "grad_norm": 0.07063707709312439, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1495 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 0.12681347131729126, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1496 + }, + { + "epoch": 1.1956869009584665, + "grad_norm": 0.17560099065303802, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1497 + }, + { + "epoch": 1.1964856230031948, + "grad_norm": 0.10635025054216385, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1498 + }, + { + "epoch": 1.1972843450479234, + "grad_norm": 0.061567965894937515, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1499 + }, + { + "epoch": 1.1980830670926517, + "grad_norm": 0.12346719950437546, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1500 + }, + { + "epoch": 1.1988817891373802, + "grad_norm": 0.07105513662099838, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1501 + }, + { + "epoch": 1.1996805111821087, + "grad_norm": 0.07719466835260391, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1502 + }, + { + "epoch": 1.200479233226837, + "grad_norm": 0.1478763371706009, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1503 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 0.1383642554283142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1504 + }, + { + "epoch": 1.2020766773162939, + "grad_norm": 0.05519767478108406, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1505 + }, + { + "epoch": 1.2028753993610224, + "grad_norm": 0.06807537376880646, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1506 + }, + { + "epoch": 1.2036741214057507, + "grad_norm": 0.10652226209640503, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1507 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 0.044540517032146454, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1508 + }, + { + "epoch": 1.2052715654952078, + "grad_norm": 0.12266546487808228, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1509 + }, + { + "epoch": 1.206070287539936, + "grad_norm": 0.1997641921043396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1510 + }, + { + "epoch": 1.2068690095846646, + "grad_norm": 0.1924593299627304, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1511 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 0.09990391880273819, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1512 + }, + { + "epoch": 1.2084664536741214, + "grad_norm": 0.04226391762495041, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1513 + }, + { + "epoch": 1.20926517571885, + "grad_norm": 0.07116132974624634, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1514 + }, + { + "epoch": 1.2100638977635783, + "grad_norm": 0.046046894043684006, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1515 + }, + { + "epoch": 1.2108626198083068, + "grad_norm": 0.039608217775821686, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1516 + }, + { + "epoch": 1.211661341853035, + "grad_norm": 0.055937573313713074, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1517 + }, + { + "epoch": 1.2124600638977636, + "grad_norm": 0.09269243478775024, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1518 + }, + { + "epoch": 1.213258785942492, + "grad_norm": 0.04349381849169731, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1519 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 0.08543939888477325, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1520 + }, + { + "epoch": 1.2148562300319488, + "grad_norm": 0.1829536110162735, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1521 + }, + { + "epoch": 1.2156549520766773, + "grad_norm": 0.23422624170780182, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1522 + }, + { + "epoch": 1.2164536741214058, + "grad_norm": 0.13391408324241638, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1523 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 0.07262124121189117, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1524 + }, + { + "epoch": 1.2180511182108626, + "grad_norm": 0.1842898577451706, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1525 + }, + { + "epoch": 1.218849840255591, + "grad_norm": 0.16982080042362213, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1526 + }, + { + "epoch": 1.2196485623003195, + "grad_norm": 0.07628878951072693, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1527 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 0.07903175801038742, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1528 + }, + { + "epoch": 1.2212460063897763, + "grad_norm": 0.1874074637889862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1529 + }, + { + "epoch": 1.2220447284345048, + "grad_norm": 0.2084639072418213, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1530 + }, + { + "epoch": 1.2228434504792332, + "grad_norm": 0.161276176571846, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1531 + }, + { + "epoch": 1.2236421725239617, + "grad_norm": 0.07408371567726135, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1532 + }, + { + "epoch": 1.2244408945686902, + "grad_norm": 0.06918113678693771, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1533 + }, + { + "epoch": 1.2252396166134185, + "grad_norm": 0.15813148021697998, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1534 + }, + { + "epoch": 1.226038338658147, + "grad_norm": 0.1454530507326126, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 1535 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 0.07441768050193787, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1536 + }, + { + "epoch": 1.2276357827476039, + "grad_norm": 0.19151917099952698, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1537 + }, + { + "epoch": 1.2284345047923322, + "grad_norm": 0.22358526289463043, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1538 + }, + { + "epoch": 1.2292332268370607, + "grad_norm": 0.12382426857948303, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1539 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 0.09593929350376129, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1540 + }, + { + "epoch": 1.2308306709265175, + "grad_norm": 0.32887372374534607, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1541 + }, + { + "epoch": 1.231629392971246, + "grad_norm": 0.3910810351371765, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1542 + }, + { + "epoch": 1.2324281150159744, + "grad_norm": 0.21341568231582642, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1543 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 0.10242578387260437, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1544 + }, + { + "epoch": 1.2340255591054312, + "grad_norm": 0.2556541860103607, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 1.2348242811501597, + "grad_norm": 0.22671715915203094, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1546 + }, + { + "epoch": 1.2356230031948883, + "grad_norm": 0.05781029909849167, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1547 + }, + { + "epoch": 1.2364217252396166, + "grad_norm": 0.2803215980529785, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1548 + }, + { + "epoch": 1.237220447284345, + "grad_norm": 0.3391420543193817, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1549 + }, + { + "epoch": 1.2380191693290734, + "grad_norm": 0.17648665606975555, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1550 + }, + { + "epoch": 1.238817891373802, + "grad_norm": 0.14975208044052124, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1551 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 0.2930659353733063, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1552 + }, + { + "epoch": 1.2404153354632588, + "grad_norm": 0.16080376505851746, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1553 + }, + { + "epoch": 1.2412140575079873, + "grad_norm": 0.1765553057193756, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1554 + }, + { + "epoch": 1.2420127795527156, + "grad_norm": 0.43610313534736633, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1555 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 0.3448547124862671, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1556 + }, + { + "epoch": 1.2436102236421724, + "grad_norm": 0.11257574707269669, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1557 + }, + { + "epoch": 1.244408945686901, + "grad_norm": 0.2212686389684677, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1558 + }, + { + "epoch": 1.2452076677316293, + "grad_norm": 0.24576987326145172, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 1559 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 0.07592078298330307, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1560 + }, + { + "epoch": 1.2468051118210863, + "grad_norm": 0.18566438555717468, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1561 + }, + { + "epoch": 1.2476038338658146, + "grad_norm": 0.2345304936170578, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1562 + }, + { + "epoch": 1.2484025559105432, + "grad_norm": 0.12168031930923462, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1563 + }, + { + "epoch": 1.2492012779552715, + "grad_norm": 0.10168169438838959, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1564 + }, + { + "epoch": 1.25, + "grad_norm": 0.14832071959972382, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1565 + }, + { + "epoch": 1.2507987220447285, + "grad_norm": 0.04516097158193588, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1566 + }, + { + "epoch": 1.2515974440894568, + "grad_norm": 0.14377422630786896, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1567 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 0.12483170628547668, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1568 + }, + { + "epoch": 1.2531948881789137, + "grad_norm": 0.06861971318721771, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1569 + }, + { + "epoch": 1.2539936102236422, + "grad_norm": 0.1124153807759285, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1570 + }, + { + "epoch": 1.2547923322683707, + "grad_norm": 0.16883404552936554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1571 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 0.09533397108316422, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1572 + }, + { + "epoch": 1.2563897763578276, + "grad_norm": 0.09215923398733139, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1573 + }, + { + "epoch": 1.2571884984025559, + "grad_norm": 0.12701599299907684, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1574 + }, + { + "epoch": 1.2579872204472844, + "grad_norm": 0.09106232225894928, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1575 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 0.047954440116882324, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1576 + }, + { + "epoch": 1.2595846645367412, + "grad_norm": 0.13917528092861176, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1577 + }, + { + "epoch": 1.2603833865814695, + "grad_norm": 0.17694029211997986, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1578 + }, + { + "epoch": 1.261182108626198, + "grad_norm": 0.11021065711975098, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1579 + }, + { + "epoch": 1.2619808306709266, + "grad_norm": 0.03982831537723541, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1580 + }, + { + "epoch": 1.262779552715655, + "grad_norm": 0.08759493380784988, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1581 + }, + { + "epoch": 1.2635782747603834, + "grad_norm": 0.04797520861029625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1582 + }, + { + "epoch": 1.2643769968051117, + "grad_norm": 0.049942485988140106, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 1583 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 0.04236803576350212, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1584 + }, + { + "epoch": 1.2659744408945688, + "grad_norm": 0.05938104912638664, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1585 + }, + { + "epoch": 1.266773162939297, + "grad_norm": 0.07487885653972626, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1586 + }, + { + "epoch": 1.2675718849840256, + "grad_norm": 0.063072569668293, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1587 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 0.07140504568815231, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1588 + }, + { + "epoch": 1.2691693290734825, + "grad_norm": 0.04790132865309715, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1589 + }, + { + "epoch": 1.269968051118211, + "grad_norm": 0.050013668835163116, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1590 + }, + { + "epoch": 1.2707667731629393, + "grad_norm": 0.0559731163084507, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1591 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 0.04633013904094696, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1592 + }, + { + "epoch": 1.2723642172523961, + "grad_norm": 0.05252271518111229, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1593 + }, + { + "epoch": 1.2731629392971247, + "grad_norm": 0.0902840718626976, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1594 + }, + { + "epoch": 1.273961661341853, + "grad_norm": 0.07961871474981308, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1595 + }, + { + "epoch": 1.2747603833865815, + "grad_norm": 0.07653608173131943, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1596 + }, + { + "epoch": 1.2755591054313098, + "grad_norm": 0.15634121000766754, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1597 + }, + { + "epoch": 1.2763578274760383, + "grad_norm": 0.2045222818851471, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1598 + }, + { + "epoch": 1.2771565495207668, + "grad_norm": 0.1769608110189438, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1599 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.09675133973360062, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1600 + }, + { + "epoch": 1.2787539936102237, + "grad_norm": 0.055832285434007645, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1601 + }, + { + "epoch": 1.279552715654952, + "grad_norm": 0.09108291566371918, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1602 + }, + { + "epoch": 1.2803514376996805, + "grad_norm": 0.10872901976108551, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1603 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 0.08771848678588867, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1604 + }, + { + "epoch": 1.2819488817891374, + "grad_norm": 0.0731026753783226, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1605 + }, + { + "epoch": 1.2827476038338659, + "grad_norm": 0.040664345026016235, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1606 + }, + { + "epoch": 1.2835463258785942, + "grad_norm": 0.06111081317067146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1607 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 0.08753795176744461, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1608 + }, + { + "epoch": 1.2851437699680512, + "grad_norm": 0.07113729417324066, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1609 + }, + { + "epoch": 1.2859424920127795, + "grad_norm": 0.05469372868537903, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1610 + }, + { + "epoch": 1.2867412140575079, + "grad_norm": 0.05748649686574936, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1611 + }, + { + "epoch": 1.2875399361022364, + "grad_norm": 0.05832446366548538, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1612 + }, + { + "epoch": 1.288338658146965, + "grad_norm": 0.06085522472858429, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1613 + }, + { + "epoch": 1.2891373801916932, + "grad_norm": 0.08154775947332382, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1614 + }, + { + "epoch": 1.2899361022364217, + "grad_norm": 0.11568816751241684, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1615 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 0.06356564909219742, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1616 + }, + { + "epoch": 1.2915335463258786, + "grad_norm": 0.08187399804592133, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1617 + }, + { + "epoch": 1.292332268370607, + "grad_norm": 0.05326744168996811, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1618 + }, + { + "epoch": 1.2931309904153354, + "grad_norm": 0.05407040938735008, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1619 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 0.07292867451906204, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 1620 + }, + { + "epoch": 1.2947284345047922, + "grad_norm": 0.09447437524795532, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1621 + }, + { + "epoch": 1.2955271565495208, + "grad_norm": 0.0592079721391201, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1622 + }, + { + "epoch": 1.2963258785942493, + "grad_norm": 0.052008479833602905, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1623 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 0.06381972879171371, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1624 + }, + { + "epoch": 1.2979233226837061, + "grad_norm": 0.07434900850057602, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1625 + }, + { + "epoch": 1.2987220447284344, + "grad_norm": 0.06477486342191696, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1626 + }, + { + "epoch": 1.299520766773163, + "grad_norm": 0.13730554282665253, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1627 + }, + { + "epoch": 1.3003194888178915, + "grad_norm": 0.1683935821056366, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1628 + }, + { + "epoch": 1.3011182108626198, + "grad_norm": 0.08616848289966583, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1629 + }, + { + "epoch": 1.3019169329073481, + "grad_norm": 0.10220590978860855, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1630 + }, + { + "epoch": 1.3027156549520766, + "grad_norm": 0.22036917507648468, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1631 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.2277965545654297, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1632 + }, + { + "epoch": 1.3043130990415335, + "grad_norm": 0.10426606982946396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1633 + }, + { + "epoch": 1.305111821086262, + "grad_norm": 0.06641022861003876, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1634 + }, + { + "epoch": 1.3059105431309903, + "grad_norm": 0.09100072830915451, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1635 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 0.06551069766283035, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1636 + }, + { + "epoch": 1.3075079872204474, + "grad_norm": 0.04397547245025635, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1637 + }, + { + "epoch": 1.3083067092651757, + "grad_norm": 0.0781746581196785, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1638 + }, + { + "epoch": 1.3091054313099042, + "grad_norm": 0.07852843403816223, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1639 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 0.09224545955657959, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1640 + }, + { + "epoch": 1.310702875399361, + "grad_norm": 0.10179189592599869, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1641 + }, + { + "epoch": 1.3115015974440896, + "grad_norm": 0.07562009245157242, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1642 + }, + { + "epoch": 1.3123003194888179, + "grad_norm": 0.15463820099830627, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1643 + }, + { + "epoch": 1.3130990415335464, + "grad_norm": 0.05742334946990013, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 1644 + }, + { + "epoch": 1.3138977635782747, + "grad_norm": 0.09010195732116699, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1645 + }, + { + "epoch": 1.3146964856230032, + "grad_norm": 0.04284297674894333, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1646 + }, + { + "epoch": 1.3154952076677318, + "grad_norm": 0.07167239487171173, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1647 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 0.04978404566645622, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1648 + }, + { + "epoch": 1.3170926517571884, + "grad_norm": 0.2888668477535248, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1649 + }, + { + "epoch": 1.317891373801917, + "grad_norm": 0.13716880977153778, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1650 + }, + { + "epoch": 1.3186900958466454, + "grad_norm": 0.13081762194633484, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1651 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 0.046977054327726364, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1652 + }, + { + "epoch": 1.3202875399361023, + "grad_norm": 0.1331615000963211, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 1653 + }, + { + "epoch": 1.3210862619808306, + "grad_norm": 0.21066126227378845, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 1654 + }, + { + "epoch": 1.321884984025559, + "grad_norm": 0.23017194867134094, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1655 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 0.20224629342556, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1656 + }, + { + "epoch": 1.323482428115016, + "grad_norm": 0.09836700558662415, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1657 + }, + { + "epoch": 1.3242811501597445, + "grad_norm": 0.10621663928031921, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1658 + }, + { + "epoch": 1.3250798722044728, + "grad_norm": 0.25464868545532227, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1659 + }, + { + "epoch": 1.3258785942492013, + "grad_norm": 0.39965251088142395, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1660 + }, + { + "epoch": 1.3266773162939298, + "grad_norm": 0.4731796383857727, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1661 + }, + { + "epoch": 1.3274760383386581, + "grad_norm": 0.4287014603614807, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1662 + }, + { + "epoch": 1.3282747603833867, + "grad_norm": 0.15660974383354187, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1663 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.14340882003307343, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1664 + }, + { + "epoch": 1.3298722044728435, + "grad_norm": 0.23041795194149017, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1665 + }, + { + "epoch": 1.330670926517572, + "grad_norm": 0.14607569575309753, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1666 + }, + { + "epoch": 1.3314696485623003, + "grad_norm": 0.0620175264775753, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1667 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 0.1722227782011032, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1668 + }, + { + "epoch": 1.3330670926517572, + "grad_norm": 0.17676329612731934, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1669 + }, + { + "epoch": 1.3338658146964857, + "grad_norm": 0.10175948590040207, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1670 + }, + { + "epoch": 1.334664536741214, + "grad_norm": 0.052259646356105804, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1671 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 0.11740414053201675, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1672 + }, + { + "epoch": 1.3362619808306708, + "grad_norm": 0.13614653050899506, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1673 + }, + { + "epoch": 1.3370607028753994, + "grad_norm": 0.12058388441801071, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 1674 + }, + { + "epoch": 1.3378594249201279, + "grad_norm": 0.12473122030496597, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1675 + }, + { + "epoch": 1.3386581469648562, + "grad_norm": 0.11198705434799194, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1676 + }, + { + "epoch": 1.3394568690095847, + "grad_norm": 0.06745828688144684, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1677 + }, + { + "epoch": 1.340255591054313, + "grad_norm": 0.06042877584695816, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1678 + }, + { + "epoch": 1.3410543130990416, + "grad_norm": 0.08762289583683014, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1679 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 0.07612926512956619, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1680 + }, + { + "epoch": 1.3426517571884984, + "grad_norm": 0.16108228266239166, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1681 + }, + { + "epoch": 1.343450479233227, + "grad_norm": 0.12803438305854797, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1682 + }, + { + "epoch": 1.3442492012779552, + "grad_norm": 0.09190207719802856, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1683 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 0.07201807200908661, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1684 + }, + { + "epoch": 1.3458466453674123, + "grad_norm": 0.06885793805122375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1685 + }, + { + "epoch": 1.3466453674121406, + "grad_norm": 0.06998719274997711, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1686 + }, + { + "epoch": 1.3474440894568689, + "grad_norm": 0.08072122186422348, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1687 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 0.1314389705657959, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1688 + }, + { + "epoch": 1.349041533546326, + "grad_norm": 0.1393643617630005, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1689 + }, + { + "epoch": 1.3498402555910542, + "grad_norm": 0.1482846736907959, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1690 + }, + { + "epoch": 1.3506389776357828, + "grad_norm": 0.10097873955965042, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1691 + }, + { + "epoch": 1.351437699680511, + "grad_norm": 0.16020123660564423, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1692 + }, + { + "epoch": 1.3522364217252396, + "grad_norm": 0.4032374322414398, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1693 + }, + { + "epoch": 1.3530351437699681, + "grad_norm": 0.21653197705745697, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1694 + }, + { + "epoch": 1.3538338658146964, + "grad_norm": 0.18634478747844696, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1695 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 0.06293921917676926, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1696 + }, + { + "epoch": 1.3554313099041533, + "grad_norm": 0.09862471371889114, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1697 + }, + { + "epoch": 1.3562300319488818, + "grad_norm": 0.17562821507453918, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1698 + }, + { + "epoch": 1.3570287539936103, + "grad_norm": 0.17277459800243378, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1699 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 0.06883158534765244, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1700 + }, + { + "epoch": 1.3586261980830672, + "grad_norm": 0.06487718969583511, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1701 + }, + { + "epoch": 1.3594249201277955, + "grad_norm": 0.08988886326551437, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1702 + }, + { + "epoch": 1.360223642172524, + "grad_norm": 0.05164919048547745, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1703 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 0.143778458237648, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1704 + }, + { + "epoch": 1.3618210862619808, + "grad_norm": 0.21736390888690948, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1705 + }, + { + "epoch": 1.3626198083067091, + "grad_norm": 0.2496086061000824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1706 + }, + { + "epoch": 1.3634185303514377, + "grad_norm": 0.21299317479133606, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1707 + }, + { + "epoch": 1.3642172523961662, + "grad_norm": 0.06845723092556, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1708 + }, + { + "epoch": 1.3650159744408945, + "grad_norm": 0.14018614590168, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1709 + }, + { + "epoch": 1.365814696485623, + "grad_norm": 0.1971539407968521, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1710 + }, + { + "epoch": 1.3666134185303513, + "grad_norm": 0.10819724202156067, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 1711 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 0.12900666892528534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1712 + }, + { + "epoch": 1.3682108626198084, + "grad_norm": 0.17080886662006378, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1713 + }, + { + "epoch": 1.3690095846645367, + "grad_norm": 0.22689902782440186, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1714 + }, + { + "epoch": 1.3698083067092652, + "grad_norm": 0.2200036197900772, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1715 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 0.15193268656730652, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1716 + }, + { + "epoch": 1.371405750798722, + "grad_norm": 0.057297177612781525, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1717 + }, + { + "epoch": 1.3722044728434506, + "grad_norm": 0.12024576961994171, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1718 + }, + { + "epoch": 1.373003194888179, + "grad_norm": 0.16183575987815857, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1719 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 0.14740106463432312, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 1720 + }, + { + "epoch": 1.3746006389776357, + "grad_norm": 0.09009548276662827, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1721 + }, + { + "epoch": 1.3753993610223643, + "grad_norm": 0.05091484636068344, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1722 + }, + { + "epoch": 1.3761980830670926, + "grad_norm": 0.05887647345662117, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1723 + }, + { + "epoch": 1.376996805111821, + "grad_norm": 0.06313642859458923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1724 + }, + { + "epoch": 1.3777955271565494, + "grad_norm": 0.06496263295412064, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1725 + }, + { + "epoch": 1.378594249201278, + "grad_norm": 0.06047922000288963, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1726 + }, + { + "epoch": 1.3793929712460065, + "grad_norm": 0.05579136312007904, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 1727 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 0.05931869521737099, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1728 + }, + { + "epoch": 1.3809904153354633, + "grad_norm": 0.049043234437704086, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1729 + }, + { + "epoch": 1.3817891373801916, + "grad_norm": 0.051883842796087265, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1730 + }, + { + "epoch": 1.3825878594249201, + "grad_norm": 0.07195441424846649, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1731 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 0.12339463829994202, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1732 + }, + { + "epoch": 1.384185303514377, + "grad_norm": 0.16951170563697815, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1733 + }, + { + "epoch": 1.3849840255591055, + "grad_norm": 0.1773078590631485, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1734 + }, + { + "epoch": 1.3857827476038338, + "grad_norm": 0.15160880982875824, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1735 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 0.12933489680290222, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1736 + }, + { + "epoch": 1.3873801916932909, + "grad_norm": 0.05910791456699371, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1737 + }, + { + "epoch": 1.3881789137380192, + "grad_norm": 0.06765501946210861, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1738 + }, + { + "epoch": 1.3889776357827475, + "grad_norm": 0.09179043024778366, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1739 + }, + { + "epoch": 1.389776357827476, + "grad_norm": 0.08842387795448303, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 1740 + }, + { + "epoch": 1.3905750798722045, + "grad_norm": 0.07700884342193604, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1741 + }, + { + "epoch": 1.3913738019169328, + "grad_norm": 0.045392196625471115, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1742 + }, + { + "epoch": 1.3921725239616614, + "grad_norm": 0.11977320909500122, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1743 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 0.1882479041814804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1744 + }, + { + "epoch": 1.3937699680511182, + "grad_norm": 0.25021475553512573, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1745 + }, + { + "epoch": 1.3945686900958467, + "grad_norm": 0.23374556005001068, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1746 + }, + { + "epoch": 1.395367412140575, + "grad_norm": 0.1016339659690857, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1747 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 0.1340985745191574, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1748 + }, + { + "epoch": 1.3969648562300319, + "grad_norm": 0.21048963069915771, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1749 + }, + { + "epoch": 1.3977635782747604, + "grad_norm": 0.20711666345596313, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1750 + }, + { + "epoch": 1.398562300319489, + "grad_norm": 0.19101384282112122, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1751 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 0.17655788362026215, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1752 + }, + { + "epoch": 1.4001597444089458, + "grad_norm": 0.11994078010320663, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1753 + }, + { + "epoch": 1.400958466453674, + "grad_norm": 0.09805315732955933, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1754 + }, + { + "epoch": 1.4017571884984026, + "grad_norm": 0.07474519312381744, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1755 + }, + { + "epoch": 1.4025559105431311, + "grad_norm": 0.11269772797822952, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1756 + }, + { + "epoch": 1.4033546325878594, + "grad_norm": 0.08900775015354156, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1757 + }, + { + "epoch": 1.4041533546325877, + "grad_norm": 0.05614674836397171, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1758 + }, + { + "epoch": 1.4049520766773163, + "grad_norm": 0.12895621359348297, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1759 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 0.16433797776699066, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1760 + }, + { + "epoch": 1.406549520766773, + "grad_norm": 0.20009422302246094, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1761 + }, + { + "epoch": 1.4073482428115016, + "grad_norm": 0.146495059132576, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1762 + }, + { + "epoch": 1.40814696485623, + "grad_norm": 0.07518120110034943, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1763 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 0.09864111244678497, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1764 + }, + { + "epoch": 1.409744408945687, + "grad_norm": 0.20213425159454346, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 1765 + }, + { + "epoch": 1.4105431309904153, + "grad_norm": 0.17369656264781952, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1766 + }, + { + "epoch": 1.4113418530351438, + "grad_norm": 0.06627536565065384, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1767 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 0.09098218381404877, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1768 + }, + { + "epoch": 1.4129392971246006, + "grad_norm": 0.11730248481035233, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1769 + }, + { + "epoch": 1.4137380191693292, + "grad_norm": 0.07061973959207535, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1770 + }, + { + "epoch": 1.4145367412140575, + "grad_norm": 0.10279946774244308, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1771 + }, + { + "epoch": 1.415335463258786, + "grad_norm": 0.18082919716835022, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1772 + }, + { + "epoch": 1.4161341853035143, + "grad_norm": 0.1592867076396942, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1773 + }, + { + "epoch": 1.4169329073482428, + "grad_norm": 0.09976492077112198, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1774 + }, + { + "epoch": 1.4177316293929714, + "grad_norm": 0.060737378895282745, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1775 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 0.06248186528682709, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1776 + }, + { + "epoch": 1.419329073482428, + "grad_norm": 0.13300968706607819, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1777 + }, + { + "epoch": 1.4201277955271565, + "grad_norm": 0.1979697346687317, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1778 + }, + { + "epoch": 1.420926517571885, + "grad_norm": 0.23268306255340576, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1779 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 0.18313626945018768, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1780 + }, + { + "epoch": 1.4225239616613419, + "grad_norm": 0.08110051602125168, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1781 + }, + { + "epoch": 1.4233226837060702, + "grad_norm": 0.09732743352651596, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1782 + }, + { + "epoch": 1.4241214057507987, + "grad_norm": 0.1656067669391632, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1783 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 0.1959427297115326, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1784 + }, + { + "epoch": 1.4257188498402555, + "grad_norm": 0.17609809339046478, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1785 + }, + { + "epoch": 1.426517571884984, + "grad_norm": 0.0999840646982193, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1786 + }, + { + "epoch": 1.4273162939297124, + "grad_norm": 0.06475909799337387, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1787 + }, + { + "epoch": 1.428115015974441, + "grad_norm": 0.1364496946334839, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1788 + }, + { + "epoch": 1.4289137380191694, + "grad_norm": 0.21113638579845428, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 1789 + }, + { + "epoch": 1.4297124600638977, + "grad_norm": 0.25998085737228394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1790 + }, + { + "epoch": 1.4305111821086263, + "grad_norm": 0.24930700659751892, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1791 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 0.131307452917099, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1792 + }, + { + "epoch": 1.432108626198083, + "grad_norm": 0.0739457756280899, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1793 + }, + { + "epoch": 1.4329073482428116, + "grad_norm": 0.2009744644165039, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1794 + }, + { + "epoch": 1.43370607028754, + "grad_norm": 0.28875023126602173, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1795 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 0.25421038269996643, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1796 + }, + { + "epoch": 1.4353035143769968, + "grad_norm": 0.09670932590961456, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1797 + }, + { + "epoch": 1.4361022364217253, + "grad_norm": 0.11264955252408981, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1798 + }, + { + "epoch": 1.4369009584664536, + "grad_norm": 0.1401909440755844, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1799 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 0.08234099298715591, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1800 + }, + { + "epoch": 1.4384984025559104, + "grad_norm": 0.05028436705470085, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1801 + }, + { + "epoch": 1.439297124600639, + "grad_norm": 0.04673704132437706, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1802 + }, + { + "epoch": 1.4400958466453675, + "grad_norm": 0.07369101047515869, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 1803 + }, + { + "epoch": 1.4408945686900958, + "grad_norm": 0.161424919962883, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1804 + }, + { + "epoch": 1.4416932907348243, + "grad_norm": 0.13576306402683258, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1805 + }, + { + "epoch": 1.4424920127795526, + "grad_norm": 0.063505619764328, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 1806 + }, + { + "epoch": 1.4432907348242812, + "grad_norm": 0.07231617718935013, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1807 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 0.1698617935180664, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1808 + }, + { + "epoch": 1.444888178913738, + "grad_norm": 0.16520395874977112, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1809 + }, + { + "epoch": 1.4456869009584665, + "grad_norm": 0.058485522866249084, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1810 + }, + { + "epoch": 1.4464856230031948, + "grad_norm": 0.0816773921251297, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1811 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 0.15307661890983582, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1812 + }, + { + "epoch": 1.4480830670926519, + "grad_norm": 0.20710408687591553, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 1813 + }, + { + "epoch": 1.4488817891373802, + "grad_norm": 0.1786869764328003, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1814 + }, + { + "epoch": 1.4496805111821085, + "grad_norm": 0.07363469898700714, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1815 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 0.10158272087574005, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1816 + }, + { + "epoch": 1.4512779552715656, + "grad_norm": 0.14304493367671967, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1817 + }, + { + "epoch": 1.4520766773162939, + "grad_norm": 0.11782495677471161, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1818 + }, + { + "epoch": 1.4528753993610224, + "grad_norm": 0.09340433776378632, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1819 + }, + { + "epoch": 1.4536741214057507, + "grad_norm": 0.08881603926420212, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1820 + }, + { + "epoch": 1.4544728434504792, + "grad_norm": 0.1377323865890503, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1821 + }, + { + "epoch": 1.4552715654952078, + "grad_norm": 0.1137915700674057, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1822 + }, + { + "epoch": 1.456070287539936, + "grad_norm": 0.08219580352306366, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1823 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 0.048282165080308914, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 1824 + }, + { + "epoch": 1.457667731629393, + "grad_norm": 0.07061316817998886, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1825 + }, + { + "epoch": 1.4584664536741214, + "grad_norm": 0.09383007138967514, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1826 + }, + { + "epoch": 1.45926517571885, + "grad_norm": 0.10688310861587524, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1827 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 0.09751323610544205, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 1828 + }, + { + "epoch": 1.4608626198083068, + "grad_norm": 0.10437846183776855, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1829 + }, + { + "epoch": 1.461661341853035, + "grad_norm": 0.13903124630451202, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1830 + }, + { + "epoch": 1.4624600638977636, + "grad_norm": 0.09480495005846024, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1831 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 0.062304843217134476, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1832 + }, + { + "epoch": 1.4640575079872205, + "grad_norm": 0.13482356071472168, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1833 + }, + { + "epoch": 1.4648562300319488, + "grad_norm": 0.2302182912826538, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1834 + }, + { + "epoch": 1.4656549520766773, + "grad_norm": 0.28565964102745056, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1835 + }, + { + "epoch": 1.4664536741214058, + "grad_norm": 0.28437626361846924, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1836 + }, + { + "epoch": 1.4672523961661341, + "grad_norm": 0.20637334883213043, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1837 + }, + { + "epoch": 1.4680511182108626, + "grad_norm": 0.08829299360513687, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1838 + }, + { + "epoch": 1.468849840255591, + "grad_norm": 0.06338132172822952, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 1839 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 0.13094602525234222, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1840 + }, + { + "epoch": 1.470447284345048, + "grad_norm": 0.15911467373371124, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1841 + }, + { + "epoch": 1.4712460063897763, + "grad_norm": 0.10913829505443573, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 1842 + }, + { + "epoch": 1.4720447284345048, + "grad_norm": 0.06934744864702225, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1843 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.07930968701839447, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1844 + }, + { + "epoch": 1.4736421725239617, + "grad_norm": 0.11225491017103195, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1845 + }, + { + "epoch": 1.4744408945686902, + "grad_norm": 0.12815739214420319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1846 + }, + { + "epoch": 1.4752396166134185, + "grad_norm": 0.0943179577589035, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1847 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 0.051353566348552704, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1848 + }, + { + "epoch": 1.4768370607028753, + "grad_norm": 0.10284367203712463, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1849 + }, + { + "epoch": 1.4776357827476039, + "grad_norm": 0.18345551192760468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1850 + }, + { + "epoch": 1.4784345047923324, + "grad_norm": 0.19532762467861176, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1851 + }, + { + "epoch": 1.4792332268370607, + "grad_norm": 0.12518467009067535, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1852 + }, + { + "epoch": 1.480031948881789, + "grad_norm": 0.05363085865974426, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1853 + }, + { + "epoch": 1.4808306709265175, + "grad_norm": 0.18222568929195404, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 1854 + }, + { + "epoch": 1.481629392971246, + "grad_norm": 0.19992542266845703, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1855 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 0.1724570095539093, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1856 + }, + { + "epoch": 1.483226837060703, + "grad_norm": 0.04096012935042381, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1857 + }, + { + "epoch": 1.4840255591054312, + "grad_norm": 0.15409474074840546, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1858 + }, + { + "epoch": 1.4848242811501597, + "grad_norm": 0.29238876700401306, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1859 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 0.35619401931762695, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1860 + }, + { + "epoch": 1.4864217252396166, + "grad_norm": 0.2790282964706421, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1861 + }, + { + "epoch": 1.487220447284345, + "grad_norm": 0.0809629037976265, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1862 + }, + { + "epoch": 1.4880191693290734, + "grad_norm": 0.1827513724565506, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1863 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 0.2284395545721054, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1864 + }, + { + "epoch": 1.4896166134185305, + "grad_norm": 0.11697912216186523, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1865 + }, + { + "epoch": 1.4904153354632588, + "grad_norm": 0.08668534457683563, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 1866 + }, + { + "epoch": 1.4912140575079873, + "grad_norm": 0.19793611764907837, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1867 + }, + { + "epoch": 1.4920127795527156, + "grad_norm": 0.18775872886180878, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1868 + }, + { + "epoch": 1.4928115015974441, + "grad_norm": 0.07068412005901337, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1869 + }, + { + "epoch": 1.4936102236421724, + "grad_norm": 0.07640416920185089, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1870 + }, + { + "epoch": 1.494408945686901, + "grad_norm": 0.1333264708518982, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1871 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 0.13000380992889404, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1872 + }, + { + "epoch": 1.4960063897763578, + "grad_norm": 0.05382491648197174, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1873 + }, + { + "epoch": 1.4968051118210863, + "grad_norm": 0.12773285806179047, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1874 + }, + { + "epoch": 1.4976038338658146, + "grad_norm": 0.2441176027059555, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1875 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 0.26628851890563965, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1876 + }, + { + "epoch": 1.4992012779552715, + "grad_norm": 0.1295953392982483, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1877 + }, + { + "epoch": 1.5, + "grad_norm": 0.10860511660575867, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1878 + }, + { + "epoch": 1.5007987220447285, + "grad_norm": 0.25177180767059326, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1879 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 0.2379150688648224, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1880 + }, + { + "epoch": 1.5023961661341851, + "grad_norm": 0.101965993642807, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1881 + }, + { + "epoch": 1.5031948881789137, + "grad_norm": 0.15633052587509155, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1882 + }, + { + "epoch": 1.5039936102236422, + "grad_norm": 0.3071416914463043, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1883 + }, + { + "epoch": 1.5047923322683707, + "grad_norm": 0.2126736044883728, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1884 + }, + { + "epoch": 1.505591054313099, + "grad_norm": 0.05252298340201378, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1885 + }, + { + "epoch": 1.5063897763578273, + "grad_norm": 0.23854316771030426, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1886 + }, + { + "epoch": 1.5071884984025559, + "grad_norm": 0.305148720741272, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1887 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 0.1371227502822876, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1888 + }, + { + "epoch": 1.508785942492013, + "grad_norm": 0.16433516144752502, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1889 + }, + { + "epoch": 1.5095846645367412, + "grad_norm": 0.24010877311229706, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1890 + }, + { + "epoch": 1.5103833865814695, + "grad_norm": 0.12839943170547485, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1891 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 0.055945366621017456, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1892 + }, + { + "epoch": 1.5119808306709266, + "grad_norm": 0.16645023226737976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1893 + }, + { + "epoch": 1.5127795527156551, + "grad_norm": 0.14626996219158173, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1894 + }, + { + "epoch": 1.5135782747603834, + "grad_norm": 0.04274629056453705, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1895 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 0.10497253388166428, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1896 + }, + { + "epoch": 1.5151757188498403, + "grad_norm": 0.159364715218544, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 1897 + }, + { + "epoch": 1.5159744408945688, + "grad_norm": 0.11409968137741089, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1898 + }, + { + "epoch": 1.516773162939297, + "grad_norm": 0.03989424183964729, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1899 + }, + { + "epoch": 1.5175718849840254, + "grad_norm": 0.12703374028205872, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1900 + }, + { + "epoch": 1.518370607028754, + "grad_norm": 0.20534875988960266, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1901 + }, + { + "epoch": 1.5191693290734825, + "grad_norm": 0.2276938110589981, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1902 + }, + { + "epoch": 1.519968051118211, + "grad_norm": 0.114278644323349, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1903 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 0.08295118063688278, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 1904 + }, + { + "epoch": 1.5215654952076676, + "grad_norm": 0.18610796332359314, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1905 + }, + { + "epoch": 1.5223642172523961, + "grad_norm": 0.1920524388551712, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1906 + }, + { + "epoch": 1.5231629392971247, + "grad_norm": 0.06447675824165344, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1907 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 0.17821159958839417, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1908 + }, + { + "epoch": 1.5247603833865815, + "grad_norm": 0.23894363641738892, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1909 + }, + { + "epoch": 1.5255591054313098, + "grad_norm": 0.14711391925811768, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1910 + }, + { + "epoch": 1.5263578274760383, + "grad_norm": 0.07863837480545044, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1911 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 0.20990678668022156, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 1912 + }, + { + "epoch": 1.5279552715654952, + "grad_norm": 0.19979886710643768, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1913 + }, + { + "epoch": 1.5287539936102237, + "grad_norm": 0.0871618464589119, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1914 + }, + { + "epoch": 1.529552715654952, + "grad_norm": 0.09294576942920685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1915 + }, + { + "epoch": 1.5303514376996805, + "grad_norm": 0.23010258376598358, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1916 + }, + { + "epoch": 1.531150159744409, + "grad_norm": 0.2919708788394928, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 1917 + }, + { + "epoch": 1.5319488817891374, + "grad_norm": 0.21767428517341614, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1918 + }, + { + "epoch": 1.5327476038338657, + "grad_norm": 0.07844182848930359, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1919 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 0.14891114830970764, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1920 + }, + { + "epoch": 1.5343450479233227, + "grad_norm": 0.17959977686405182, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1921 + }, + { + "epoch": 1.5351437699680512, + "grad_norm": 0.10217028856277466, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1922 + }, + { + "epoch": 1.5359424920127795, + "grad_norm": 0.08135818690061569, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1923 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 0.19660547375679016, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1924 + }, + { + "epoch": 1.5375399361022364, + "grad_norm": 0.2106354534626007, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1925 + }, + { + "epoch": 1.538338658146965, + "grad_norm": 0.11042182147502899, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1926 + }, + { + "epoch": 1.5391373801916934, + "grad_norm": 0.08777181059122086, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1927 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 0.18283812701702118, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1928 + }, + { + "epoch": 1.54073482428115, + "grad_norm": 0.11731691658496857, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1929 + }, + { + "epoch": 1.5415335463258786, + "grad_norm": 0.04163304716348648, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1930 + }, + { + "epoch": 1.542332268370607, + "grad_norm": 0.12119868397712708, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1931 + }, + { + "epoch": 1.5431309904153354, + "grad_norm": 0.18475785851478577, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1932 + }, + { + "epoch": 1.543929712460064, + "grad_norm": 0.16582897305488586, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1933 + }, + { + "epoch": 1.5447284345047922, + "grad_norm": 0.086383156478405, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1934 + }, + { + "epoch": 1.5455271565495208, + "grad_norm": 0.047143738716840744, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1935 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 0.0830119326710701, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1936 + }, + { + "epoch": 1.5471246006389776, + "grad_norm": 0.14226214587688446, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1937 + }, + { + "epoch": 1.547923322683706, + "grad_norm": 0.1719929724931717, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1938 + }, + { + "epoch": 1.5487220447284344, + "grad_norm": 0.18388192355632782, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1939 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 0.16870245337486267, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1940 + }, + { + "epoch": 1.5503194888178915, + "grad_norm": 0.1100412905216217, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1941 + }, + { + "epoch": 1.5511182108626198, + "grad_norm": 0.05124165490269661, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1942 + }, + { + "epoch": 1.5519169329073481, + "grad_norm": 0.08937443792819977, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1943 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 0.13589949905872345, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1944 + }, + { + "epoch": 1.5535143769968052, + "grad_norm": 0.12346407026052475, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1945 + }, + { + "epoch": 1.5543130990415337, + "grad_norm": 0.11836438626050949, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1946 + }, + { + "epoch": 1.555111821086262, + "grad_norm": 0.07569031417369843, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1947 + }, + { + "epoch": 1.5559105431309903, + "grad_norm": 0.039178211241960526, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1948 + }, + { + "epoch": 1.5567092651757188, + "grad_norm": 0.0431843139231205, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1949 + }, + { + "epoch": 1.5575079872204474, + "grad_norm": 0.06331207603216171, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1950 + }, + { + "epoch": 1.5583067092651757, + "grad_norm": 0.0670275092124939, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1951 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 0.04372883588075638, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1952 + }, + { + "epoch": 1.5599041533546325, + "grad_norm": 0.15768256783485413, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1953 + }, + { + "epoch": 1.560702875399361, + "grad_norm": 0.30828192830085754, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1954 + }, + { + "epoch": 1.5615015974440896, + "grad_norm": 0.3741140365600586, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1955 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 0.25689223408699036, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1956 + }, + { + "epoch": 1.5630990415335462, + "grad_norm": 0.0691552683711052, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1957 + }, + { + "epoch": 1.5638977635782747, + "grad_norm": 0.2742094099521637, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1958 + }, + { + "epoch": 1.5646964856230032, + "grad_norm": 0.2760325074195862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1959 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 0.09094057232141495, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1960 + }, + { + "epoch": 1.56629392971246, + "grad_norm": 0.11926092952489853, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1961 + }, + { + "epoch": 1.5670926517571884, + "grad_norm": 0.18398839235305786, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1962 + }, + { + "epoch": 1.567891373801917, + "grad_norm": 0.17090962827205658, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1963 + }, + { + "epoch": 1.5686900958466454, + "grad_norm": 0.07806222885847092, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1964 + }, + { + "epoch": 1.569488817891374, + "grad_norm": 0.17260140180587769, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1965 + }, + { + "epoch": 1.5702875399361023, + "grad_norm": 0.2848401665687561, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1966 + }, + { + "epoch": 1.5710862619808306, + "grad_norm": 0.19075879454612732, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1967 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 0.044234778732061386, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 1968 + }, + { + "epoch": 1.5726837060702876, + "grad_norm": 0.16188788414001465, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1969 + }, + { + "epoch": 1.573482428115016, + "grad_norm": 0.19148766994476318, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1970 + }, + { + "epoch": 1.5742811501597445, + "grad_norm": 0.11576604843139648, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1971 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 0.049716517329216, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1972 + }, + { + "epoch": 1.5758785942492013, + "grad_norm": 0.12528614699840546, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1973 + }, + { + "epoch": 1.5766773162939298, + "grad_norm": 0.1574268341064453, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1974 + }, + { + "epoch": 1.5774760383386581, + "grad_norm": 0.06606525182723999, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1975 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 0.16142094135284424, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1976 + }, + { + "epoch": 1.579073482428115, + "grad_norm": 0.29769718647003174, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1977 + }, + { + "epoch": 1.5798722044728435, + "grad_norm": 0.20111548900604248, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1978 + }, + { + "epoch": 1.580670926517572, + "grad_norm": 0.06375493854284286, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1979 + }, + { + "epoch": 1.5814696485623003, + "grad_norm": 0.2208068072795868, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1980 + }, + { + "epoch": 1.5822683706070286, + "grad_norm": 0.2920839488506317, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1981 + }, + { + "epoch": 1.5830670926517572, + "grad_norm": 0.2115958034992218, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1982 + }, + { + "epoch": 1.5838658146964857, + "grad_norm": 0.048249468207359314, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1983 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 0.15551301836967468, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1984 + }, + { + "epoch": 1.5854632587859425, + "grad_norm": 0.2190883755683899, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 1985 + }, + { + "epoch": 1.5862619808306708, + "grad_norm": 0.15155111253261566, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1986 + }, + { + "epoch": 1.5870607028753994, + "grad_norm": 0.056616391986608505, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1987 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 0.1638905555009842, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1988 + }, + { + "epoch": 1.5886581469648562, + "grad_norm": 0.11643283069133759, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1989 + }, + { + "epoch": 1.5894568690095847, + "grad_norm": 0.06423045694828033, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1990 + }, + { + "epoch": 1.590255591054313, + "grad_norm": 0.11044095456600189, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1991 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 0.11911707371473312, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1992 + }, + { + "epoch": 1.59185303514377, + "grad_norm": 0.045604925602674484, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1993 + }, + { + "epoch": 1.5926517571884984, + "grad_norm": 0.10280558466911316, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1994 + }, + { + "epoch": 1.5934504792332267, + "grad_norm": 0.13807371258735657, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1995 + }, + { + "epoch": 1.5942492012779552, + "grad_norm": 0.06163270026445389, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1996 + }, + { + "epoch": 1.5950479233226837, + "grad_norm": 0.12899963557720184, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1997 + }, + { + "epoch": 1.5958466453674123, + "grad_norm": 0.24358411133289337, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 1998 + }, + { + "epoch": 1.5966453674121406, + "grad_norm": 0.23341934382915497, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1999 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 0.11766334623098373, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2000 + }, + { + "epoch": 1.5982428115015974, + "grad_norm": 0.07918071746826172, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2001 + }, + { + "epoch": 1.599041533546326, + "grad_norm": 0.1473437398672104, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2002 + }, + { + "epoch": 1.5998402555910545, + "grad_norm": 0.08945708721876144, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2003 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.06553255021572113, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2004 + }, + { + "epoch": 1.601437699680511, + "grad_norm": 0.12708786129951477, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2005 + }, + { + "epoch": 1.6022364217252396, + "grad_norm": 0.16935905814170837, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2006 + }, + { + "epoch": 1.6030351437699681, + "grad_norm": 0.10428016632795334, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2007 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 0.06016766279935837, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 2008 + }, + { + "epoch": 1.604632587859425, + "grad_norm": 0.1563751995563507, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2009 + }, + { + "epoch": 1.6054313099041533, + "grad_norm": 0.1919829398393631, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2010 + }, + { + "epoch": 1.6062300319488818, + "grad_norm": 0.14739179611206055, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2011 + }, + { + "epoch": 1.6070287539936103, + "grad_norm": 0.08086550235748291, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2012 + }, + { + "epoch": 1.6078274760383386, + "grad_norm": 0.06594815105199814, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2013 + }, + { + "epoch": 1.608626198083067, + "grad_norm": 0.10502789169549942, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2014 + }, + { + "epoch": 1.6094249201277955, + "grad_norm": 0.1312190145254135, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2015 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 0.062411367893218994, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2016 + }, + { + "epoch": 1.6110223642172525, + "grad_norm": 0.04986036196351051, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2017 + }, + { + "epoch": 1.6118210862619808, + "grad_norm": 0.08428573608398438, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2018 + }, + { + "epoch": 1.6126198083067091, + "grad_norm": 0.11552372574806213, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2019 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 0.07657046616077423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2020 + }, + { + "epoch": 1.6142172523961662, + "grad_norm": 0.05540962517261505, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 2021 + }, + { + "epoch": 1.6150159744408947, + "grad_norm": 0.048573557287454605, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2022 + }, + { + "epoch": 1.615814696485623, + "grad_norm": 0.08630840480327606, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2023 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 0.06090754270553589, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2024 + }, + { + "epoch": 1.6174121405750799, + "grad_norm": 0.05828041955828667, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2025 + }, + { + "epoch": 1.6182108626198084, + "grad_norm": 0.12483426928520203, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2026 + }, + { + "epoch": 1.6190095846645367, + "grad_norm": 0.13772840797901154, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2027 + }, + { + "epoch": 1.619808306709265, + "grad_norm": 0.08477568626403809, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2028 + }, + { + "epoch": 1.6206070287539935, + "grad_norm": 0.037577688694000244, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2029 + }, + { + "epoch": 1.621405750798722, + "grad_norm": 0.07961893081665039, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2030 + }, + { + "epoch": 1.6222044728434506, + "grad_norm": 0.06744182854890823, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2031 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 0.06228869408369064, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2032 + }, + { + "epoch": 1.6238019169329072, + "grad_norm": 0.1972920298576355, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2033 + }, + { + "epoch": 1.6246006389776357, + "grad_norm": 0.2701529562473297, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2034 + }, + { + "epoch": 1.6253993610223643, + "grad_norm": 0.20371970534324646, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2035 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 0.08887646347284317, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2036 + }, + { + "epoch": 1.626996805111821, + "grad_norm": 0.06480003893375397, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2037 + }, + { + "epoch": 1.6277955271565494, + "grad_norm": 0.089780792593956, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2038 + }, + { + "epoch": 1.628594249201278, + "grad_norm": 0.04014933854341507, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2039 + }, + { + "epoch": 1.6293929712460065, + "grad_norm": 0.0993470847606659, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 2040 + }, + { + "epoch": 1.630191693290735, + "grad_norm": 0.1957429200410843, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2041 + }, + { + "epoch": 1.6309904153354633, + "grad_norm": 0.2273249477148056, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2042 + }, + { + "epoch": 1.6317891373801916, + "grad_norm": 0.1936638057231903, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2043 + }, + { + "epoch": 1.6325878594249201, + "grad_norm": 0.10150687396526337, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2044 + }, + { + "epoch": 1.6333865814696487, + "grad_norm": 0.051224563270807266, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2045 + }, + { + "epoch": 1.634185303514377, + "grad_norm": 0.13044138252735138, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2046 + }, + { + "epoch": 1.6349840255591053, + "grad_norm": 0.16140064597129822, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2047 + }, + { + "epoch": 1.6357827476038338, + "grad_norm": 0.13187173008918762, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2048 + }, + { + "epoch": 1.6365814696485623, + "grad_norm": 0.03873397782444954, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2049 + }, + { + "epoch": 1.6373801916932909, + "grad_norm": 0.0575883649289608, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2050 + }, + { + "epoch": 1.6381789137380192, + "grad_norm": 0.039476748555898666, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 2051 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 0.06802869588136673, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2052 + }, + { + "epoch": 1.639776357827476, + "grad_norm": 0.059946198016405106, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2053 + }, + { + "epoch": 1.6405750798722045, + "grad_norm": 0.05185665935277939, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2054 + }, + { + "epoch": 1.641373801916933, + "grad_norm": 0.08230192214250565, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2055 + }, + { + "epoch": 1.6421725239616614, + "grad_norm": 0.10175196081399918, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2056 + }, + { + "epoch": 1.6429712460063897, + "grad_norm": 0.07616171985864639, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2057 + }, + { + "epoch": 1.6437699680511182, + "grad_norm": 0.4597811698913574, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2058 + }, + { + "epoch": 1.6445686900958467, + "grad_norm": 0.12450811266899109, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2059 + }, + { + "epoch": 1.645367412140575, + "grad_norm": 0.10847678035497665, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2060 + }, + { + "epoch": 1.6461661341853036, + "grad_norm": 0.05778864026069641, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2061 + }, + { + "epoch": 1.6469648562300319, + "grad_norm": 0.04321129992604256, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2062 + }, + { + "epoch": 1.6477635782747604, + "grad_norm": 0.05467045307159424, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2063 + }, + { + "epoch": 1.648562300319489, + "grad_norm": 0.044298864901065826, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2064 + }, + { + "epoch": 1.6493610223642172, + "grad_norm": 0.03863062337040901, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2065 + }, + { + "epoch": 1.6501597444089455, + "grad_norm": 0.04040979593992233, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2066 + }, + { + "epoch": 1.650958466453674, + "grad_norm": 0.03647322207689285, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2067 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 0.049459293484687805, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2068 + }, + { + "epoch": 1.6525559105431311, + "grad_norm": 0.052851296961307526, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2069 + }, + { + "epoch": 1.6533546325878594, + "grad_norm": 0.10360822081565857, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2070 + }, + { + "epoch": 1.6541533546325877, + "grad_norm": 0.18817105889320374, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2071 + }, + { + "epoch": 1.6549520766773163, + "grad_norm": 0.1711605340242386, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2072 + }, + { + "epoch": 1.6557507987220448, + "grad_norm": 0.08807278424501419, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2073 + }, + { + "epoch": 1.6565495207667733, + "grad_norm": 0.0631125420331955, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2074 + }, + { + "epoch": 1.6573482428115016, + "grad_norm": 0.17277394235134125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2075 + }, + { + "epoch": 1.65814696485623, + "grad_norm": 0.2353454977273941, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2076 + }, + { + "epoch": 1.6589456869009584, + "grad_norm": 0.18835891783237457, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2077 + }, + { + "epoch": 1.659744408945687, + "grad_norm": 0.08717352151870728, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2078 + }, + { + "epoch": 1.6605431309904153, + "grad_norm": 0.05640486627817154, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2079 + }, + { + "epoch": 1.6613418530351438, + "grad_norm": 0.11206189543008804, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2080 + }, + { + "epoch": 1.6621405750798721, + "grad_norm": 0.10098055750131607, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2081 + }, + { + "epoch": 1.6629392971246006, + "grad_norm": 0.04627184569835663, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2082 + }, + { + "epoch": 1.6637380191693292, + "grad_norm": 0.13048212230205536, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2083 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 0.22329512238502502, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2084 + }, + { + "epoch": 1.6653354632587858, + "grad_norm": 0.23544666171073914, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2085 + }, + { + "epoch": 1.6661341853035143, + "grad_norm": 0.1329459846019745, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2086 + }, + { + "epoch": 1.6669329073482428, + "grad_norm": 0.07398947328329086, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2087 + }, + { + "epoch": 1.6677316293929714, + "grad_norm": 0.1926809549331665, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2088 + }, + { + "epoch": 1.6685303514376997, + "grad_norm": 0.19097647070884705, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2089 + }, + { + "epoch": 1.669329073482428, + "grad_norm": 0.10474745184183121, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2090 + }, + { + "epoch": 1.6701277955271565, + "grad_norm": 0.04437112435698509, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2091 + }, + { + "epoch": 1.670926517571885, + "grad_norm": 0.13698135316371918, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2092 + }, + { + "epoch": 1.6717252396166136, + "grad_norm": 0.14437462389469147, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2093 + }, + { + "epoch": 1.6725239616613419, + "grad_norm": 0.0938732922077179, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2094 + }, + { + "epoch": 1.6733226837060702, + "grad_norm": 0.060729511082172394, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2095 + }, + { + "epoch": 1.6741214057507987, + "grad_norm": 0.05354619398713112, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2096 + }, + { + "epoch": 1.6749201277955272, + "grad_norm": 0.056909799575805664, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2097 + }, + { + "epoch": 1.6757188498402555, + "grad_norm": 0.09815286099910736, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2098 + }, + { + "epoch": 1.676517571884984, + "grad_norm": 0.1432102620601654, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2099 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 0.14039601385593414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2100 + }, + { + "epoch": 1.678115015974441, + "grad_norm": 0.06634008139371872, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2101 + }, + { + "epoch": 1.6789137380191694, + "grad_norm": 0.1347021609544754, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2102 + }, + { + "epoch": 1.6797124600638977, + "grad_norm": 0.24721868336200714, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2103 + }, + { + "epoch": 1.680511182108626, + "grad_norm": 0.23194770514965057, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2104 + }, + { + "epoch": 1.6813099041533546, + "grad_norm": 0.12276436388492584, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2105 + }, + { + "epoch": 1.682108626198083, + "grad_norm": 0.06224825233221054, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2106 + }, + { + "epoch": 1.6829073482428116, + "grad_norm": 0.20683766901493073, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2107 + }, + { + "epoch": 1.68370607028754, + "grad_norm": 0.26914462447166443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2108 + }, + { + "epoch": 1.6845047923322682, + "grad_norm": 0.20070654153823853, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2109 + }, + { + "epoch": 1.6853035143769968, + "grad_norm": 0.08465532958507538, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2110 + }, + { + "epoch": 1.6861022364217253, + "grad_norm": 0.10843367129564285, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2111 + }, + { + "epoch": 1.6869009584664538, + "grad_norm": 0.20252646505832672, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2112 + }, + { + "epoch": 1.6876996805111821, + "grad_norm": 0.11803672462701797, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2113 + }, + { + "epoch": 1.6884984025559104, + "grad_norm": 0.08800901472568512, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2114 + }, + { + "epoch": 1.689297124600639, + "grad_norm": 0.23917800188064575, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2115 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 0.21528035402297974, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2116 + }, + { + "epoch": 1.6908945686900958, + "grad_norm": 0.05292942747473717, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2117 + }, + { + "epoch": 1.6916932907348243, + "grad_norm": 0.12942583858966827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2118 + }, + { + "epoch": 1.6924920127795526, + "grad_norm": 0.19304881989955902, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2119 + }, + { + "epoch": 1.6932907348242812, + "grad_norm": 0.10951094329357147, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 2120 + }, + { + "epoch": 1.6940894568690097, + "grad_norm": 0.07684643566608429, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2121 + }, + { + "epoch": 1.694888178913738, + "grad_norm": 0.14990608394145966, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2122 + }, + { + "epoch": 1.6956869009584663, + "grad_norm": 0.1104716882109642, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2123 + }, + { + "epoch": 1.6964856230031948, + "grad_norm": 0.06538088619709015, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2124 + }, + { + "epoch": 1.6972843450479234, + "grad_norm": 0.05474448576569557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2125 + }, + { + "epoch": 1.6980830670926519, + "grad_norm": 0.0803864449262619, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2126 + }, + { + "epoch": 1.6988817891373802, + "grad_norm": 0.04384651407599449, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2127 + }, + { + "epoch": 1.6996805111821085, + "grad_norm": 0.07006746530532837, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 2128 + }, + { + "epoch": 1.700479233226837, + "grad_norm": 0.08840122073888779, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2129 + }, + { + "epoch": 1.7012779552715656, + "grad_norm": 0.06421404331922531, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2130 + }, + { + "epoch": 1.702076677316294, + "grad_norm": 0.03711751103401184, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2131 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 0.06725160032510757, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2132 + }, + { + "epoch": 1.7036741214057507, + "grad_norm": 0.0517839640378952, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2133 + }, + { + "epoch": 1.7044728434504792, + "grad_norm": 0.046399205923080444, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2134 + }, + { + "epoch": 1.7052715654952078, + "grad_norm": 0.05188435688614845, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2135 + }, + { + "epoch": 1.706070287539936, + "grad_norm": 0.08578629791736603, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2136 + }, + { + "epoch": 1.7068690095846646, + "grad_norm": 0.07895999401807785, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2137 + }, + { + "epoch": 1.707667731629393, + "grad_norm": 0.060662928968667984, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2138 + }, + { + "epoch": 1.7084664536741214, + "grad_norm": 0.08372191339731216, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2139 + }, + { + "epoch": 1.70926517571885, + "grad_norm": 0.1217966303229332, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2140 + }, + { + "epoch": 1.7100638977635783, + "grad_norm": 0.14054186642169952, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2141 + }, + { + "epoch": 1.7108626198083066, + "grad_norm": 0.11693520098924637, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2142 + }, + { + "epoch": 1.711661341853035, + "grad_norm": 0.04271163418889046, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2143 + }, + { + "epoch": 1.7124600638977636, + "grad_norm": 0.11898874491453171, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2144 + }, + { + "epoch": 1.7132587859424921, + "grad_norm": 0.2637499272823334, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2145 + }, + { + "epoch": 1.7140575079872205, + "grad_norm": 0.29218390583992004, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2146 + }, + { + "epoch": 1.7148562300319488, + "grad_norm": 0.1899375170469284, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2147 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 0.04336607828736305, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2148 + }, + { + "epoch": 1.7164536741214058, + "grad_norm": 0.14123578369617462, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2149 + }, + { + "epoch": 1.7172523961661343, + "grad_norm": 0.19930055737495422, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2150 + }, + { + "epoch": 1.7180511182108626, + "grad_norm": 0.1796298772096634, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2151 + }, + { + "epoch": 1.718849840255591, + "grad_norm": 0.07607068121433258, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2152 + }, + { + "epoch": 1.7196485623003195, + "grad_norm": 0.12980210781097412, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2153 + }, + { + "epoch": 1.720447284345048, + "grad_norm": 0.2507205009460449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2154 + }, + { + "epoch": 1.7212460063897763, + "grad_norm": 0.2388920783996582, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2155 + }, + { + "epoch": 1.7220447284345048, + "grad_norm": 0.13363847136497498, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 1.7228434504792332, + "grad_norm": 0.048030026257038116, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 2157 + }, + { + "epoch": 1.7236421725239617, + "grad_norm": 0.14619708061218262, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2158 + }, + { + "epoch": 1.7244408945686902, + "grad_norm": 0.22031216323375702, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2159 + }, + { + "epoch": 1.7252396166134185, + "grad_norm": 0.18440701067447662, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2160 + }, + { + "epoch": 1.7260383386581468, + "grad_norm": 0.08183866739273071, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2161 + }, + { + "epoch": 1.7268370607028753, + "grad_norm": 0.05314984545111656, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2162 + }, + { + "epoch": 1.7276357827476039, + "grad_norm": 0.1438753753900528, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2163 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 0.0881122425198555, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 2164 + }, + { + "epoch": 1.7292332268370607, + "grad_norm": 0.1165589690208435, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2165 + }, + { + "epoch": 1.730031948881789, + "grad_norm": 0.14884884655475616, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2166 + }, + { + "epoch": 1.7308306709265175, + "grad_norm": 0.10219287127256393, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2167 + }, + { + "epoch": 1.731629392971246, + "grad_norm": 0.059794824570417404, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2168 + }, + { + "epoch": 1.7324281150159746, + "grad_norm": 0.0538945347070694, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2169 + }, + { + "epoch": 1.733226837060703, + "grad_norm": 0.1016303226351738, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2170 + }, + { + "epoch": 1.7340255591054312, + "grad_norm": 0.058912694454193115, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2171 + }, + { + "epoch": 1.7348242811501597, + "grad_norm": 0.060018621385097504, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2172 + }, + { + "epoch": 1.7356230031948883, + "grad_norm": 0.05386706069111824, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2173 + }, + { + "epoch": 1.7364217252396166, + "grad_norm": 0.06266453117132187, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2174 + }, + { + "epoch": 1.7372204472843449, + "grad_norm": 0.1035243570804596, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 2175 + }, + { + "epoch": 1.7380191693290734, + "grad_norm": 0.17216888070106506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2176 + }, + { + "epoch": 1.738817891373802, + "grad_norm": 0.23428532481193542, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2177 + }, + { + "epoch": 1.7396166134185305, + "grad_norm": 0.21038073301315308, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2178 + }, + { + "epoch": 1.7404153354632588, + "grad_norm": 0.1487000286579132, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2179 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 0.03916196525096893, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2180 + }, + { + "epoch": 1.7420127795527156, + "grad_norm": 0.13702991604804993, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2181 + }, + { + "epoch": 1.7428115015974441, + "grad_norm": 0.21363528072834015, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2182 + }, + { + "epoch": 1.7436102236421727, + "grad_norm": 0.134271502494812, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2183 + }, + { + "epoch": 1.744408945686901, + "grad_norm": 0.062452565878629684, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2184 + }, + { + "epoch": 1.7452076677316293, + "grad_norm": 0.1745995730161667, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2185 + }, + { + "epoch": 1.7460063897763578, + "grad_norm": 0.19709894061088562, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2186 + }, + { + "epoch": 1.7468051118210863, + "grad_norm": 0.1201571598649025, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2187 + }, + { + "epoch": 1.7476038338658149, + "grad_norm": 0.03690087050199509, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2188 + }, + { + "epoch": 1.7484025559105432, + "grad_norm": 0.1387440711259842, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2189 + }, + { + "epoch": 1.7492012779552715, + "grad_norm": 0.2084781676530838, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2190 + }, + { + "epoch": 1.75, + "grad_norm": 0.17941167950630188, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2191 + }, + { + "epoch": 1.7507987220447285, + "grad_norm": 0.09751889854669571, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2192 + }, + { + "epoch": 1.7515974440894568, + "grad_norm": 0.04116421565413475, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2193 + }, + { + "epoch": 1.7523961661341851, + "grad_norm": 0.14683429896831512, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2194 + }, + { + "epoch": 1.7531948881789137, + "grad_norm": 0.19602352380752563, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2195 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 0.18503598868846893, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2196 + }, + { + "epoch": 1.7547923322683707, + "grad_norm": 0.09473808109760284, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2197 + }, + { + "epoch": 1.755591054313099, + "grad_norm": 0.05645129457116127, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2198 + }, + { + "epoch": 1.7563897763578273, + "grad_norm": 0.09260818362236023, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2199 + }, + { + "epoch": 1.7571884984025559, + "grad_norm": 0.045891985297203064, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2200 + }, + { + "epoch": 1.7579872204472844, + "grad_norm": 0.125623419880867, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2201 + }, + { + "epoch": 1.758785942492013, + "grad_norm": 0.18919512629508972, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2202 + }, + { + "epoch": 1.7595846645367412, + "grad_norm": 0.17549264430999756, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2203 + }, + { + "epoch": 1.7603833865814695, + "grad_norm": 0.047342319041490555, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 2204 + }, + { + "epoch": 1.761182108626198, + "grad_norm": 0.177268847823143, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2205 + }, + { + "epoch": 1.7619808306709266, + "grad_norm": 0.28258222341537476, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2206 + }, + { + "epoch": 1.7627795527156551, + "grad_norm": 0.25111353397369385, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2207 + }, + { + "epoch": 1.7635782747603834, + "grad_norm": 0.11864849925041199, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2208 + }, + { + "epoch": 1.7643769968051117, + "grad_norm": 0.06387785822153091, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2209 + }, + { + "epoch": 1.7651757188498403, + "grad_norm": 0.1264238804578781, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2210 + }, + { + "epoch": 1.7659744408945688, + "grad_norm": 0.12080882489681244, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2211 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.05618004873394966, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2212 + }, + { + "epoch": 1.7675718849840254, + "grad_norm": 0.06543037295341492, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2213 + }, + { + "epoch": 1.768370607028754, + "grad_norm": 0.08525256812572479, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2214 + }, + { + "epoch": 1.7691693290734825, + "grad_norm": 0.08571972697973251, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2215 + }, + { + "epoch": 1.769968051118211, + "grad_norm": 0.04897582530975342, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2216 + }, + { + "epoch": 1.7707667731629393, + "grad_norm": 0.07296427339315414, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2217 + }, + { + "epoch": 1.7715654952076676, + "grad_norm": 0.041904110461473465, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2218 + }, + { + "epoch": 1.7723642172523961, + "grad_norm": 0.053191233426332474, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2219 + }, + { + "epoch": 1.7731629392971247, + "grad_norm": 0.056369587779045105, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2220 + }, + { + "epoch": 1.7739616613418532, + "grad_norm": 0.06455157697200775, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2221 + }, + { + "epoch": 1.7747603833865815, + "grad_norm": 0.06467561423778534, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2222 + }, + { + "epoch": 1.7755591054313098, + "grad_norm": 0.07162238657474518, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2223 + }, + { + "epoch": 1.7763578274760383, + "grad_norm": 0.045193906873464584, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2224 + }, + { + "epoch": 1.7771565495207668, + "grad_norm": 0.07172992080450058, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2225 + }, + { + "epoch": 1.7779552715654952, + "grad_norm": 0.07163143157958984, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2226 + }, + { + "epoch": 1.7787539936102237, + "grad_norm": 0.11480346322059631, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2227 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 0.21525998413562775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 2228 + }, + { + "epoch": 1.7803514376996805, + "grad_norm": 0.20769886672496796, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2229 + }, + { + "epoch": 1.781150159744409, + "grad_norm": 0.13149204850196838, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2230 + }, + { + "epoch": 1.7819488817891374, + "grad_norm": 0.06223989278078079, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2231 + }, + { + "epoch": 1.7827476038338657, + "grad_norm": 0.11386150866746902, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2232 + }, + { + "epoch": 1.7835463258785942, + "grad_norm": 0.1448865532875061, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2233 + }, + { + "epoch": 1.7843450479233227, + "grad_norm": 0.11244893074035645, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2234 + }, + { + "epoch": 1.7851437699680512, + "grad_norm": 0.06307587027549744, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2235 + }, + { + "epoch": 1.7859424920127795, + "grad_norm": 0.1529018133878708, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2236 + }, + { + "epoch": 1.7867412140575079, + "grad_norm": 0.212649405002594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2237 + }, + { + "epoch": 1.7875399361022364, + "grad_norm": 0.18361856043338776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2238 + }, + { + "epoch": 1.788338658146965, + "grad_norm": 0.06960433721542358, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2239 + }, + { + "epoch": 1.7891373801916934, + "grad_norm": 0.13445821404457092, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2240 + }, + { + "epoch": 1.7899361022364217, + "grad_norm": 0.24758578836917877, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2241 + }, + { + "epoch": 1.79073482428115, + "grad_norm": 0.27208608388900757, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2242 + }, + { + "epoch": 1.7915335463258786, + "grad_norm": 0.1256505697965622, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2243 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 0.12209334224462509, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2244 + }, + { + "epoch": 1.7931309904153354, + "grad_norm": 0.2690032720565796, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2245 + }, + { + "epoch": 1.793929712460064, + "grad_norm": 0.27393221855163574, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2246 + }, + { + "epoch": 1.7947284345047922, + "grad_norm": 0.12508991360664368, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 2247 + }, + { + "epoch": 1.7955271565495208, + "grad_norm": 0.10001108795404434, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2248 + }, + { + "epoch": 1.7963258785942493, + "grad_norm": 0.2588697373867035, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2249 + }, + { + "epoch": 1.7971246006389776, + "grad_norm": 0.24723860621452332, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2250 + }, + { + "epoch": 1.797923322683706, + "grad_norm": 0.09018664062023163, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2251 + }, + { + "epoch": 1.7987220447284344, + "grad_norm": 0.09745316952466965, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 2252 + }, + { + "epoch": 1.799520766773163, + "grad_norm": 0.20877481997013092, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 2253 + }, + { + "epoch": 1.8003194888178915, + "grad_norm": 0.24291004240512848, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2254 + }, + { + "epoch": 1.8011182108626198, + "grad_norm": 0.1967754364013672, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2255 + }, + { + "epoch": 1.8019169329073481, + "grad_norm": 0.088215172290802, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2256 + }, + { + "epoch": 1.8027156549520766, + "grad_norm": 0.07018816471099854, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2257 + }, + { + "epoch": 1.8035143769968052, + "grad_norm": 0.17161858081817627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2258 + }, + { + "epoch": 1.8043130990415337, + "grad_norm": 0.22007174789905548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2259 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 0.16093726456165314, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2260 + }, + { + "epoch": 1.8059105431309903, + "grad_norm": 0.06763539463281631, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2261 + }, + { + "epoch": 1.8067092651757188, + "grad_norm": 0.1066257432103157, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2262 + }, + { + "epoch": 1.8075079872204474, + "grad_norm": 0.17658250033855438, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2263 + }, + { + "epoch": 1.8083067092651757, + "grad_norm": 0.21157506108283997, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2264 + }, + { + "epoch": 1.8091054313099042, + "grad_norm": 0.16717523336410522, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2265 + }, + { + "epoch": 1.8099041533546325, + "grad_norm": 0.08356527984142303, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2266 + }, + { + "epoch": 1.810702875399361, + "grad_norm": 0.11939100921154022, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2267 + }, + { + "epoch": 1.8115015974440896, + "grad_norm": 0.2322039157152176, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2268 + }, + { + "epoch": 1.8123003194888179, + "grad_norm": 0.2277170568704605, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2269 + }, + { + "epoch": 1.8130990415335462, + "grad_norm": 0.06634530425071716, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2270 + }, + { + "epoch": 1.8138977635782747, + "grad_norm": 0.20808424055576324, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2271 + }, + { + "epoch": 1.8146964856230032, + "grad_norm": 0.3761717975139618, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2272 + }, + { + "epoch": 1.8154952076677318, + "grad_norm": 0.3587193191051483, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2273 + }, + { + "epoch": 1.81629392971246, + "grad_norm": 0.12116564810276031, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2274 + }, + { + "epoch": 1.8170926517571884, + "grad_norm": 0.20137764513492584, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2275 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 0.30456987023353577, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2276 + }, + { + "epoch": 1.8186900958466454, + "grad_norm": 0.15625369548797607, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2277 + }, + { + "epoch": 1.819488817891374, + "grad_norm": 0.12682494521141052, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2278 + }, + { + "epoch": 1.8202875399361023, + "grad_norm": 0.26252153515815735, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2279 + }, + { + "epoch": 1.8210862619808306, + "grad_norm": 0.17610949277877808, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2280 + }, + { + "epoch": 1.821884984025559, + "grad_norm": 0.056205663830041885, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2281 + }, + { + "epoch": 1.8226837060702876, + "grad_norm": 0.1519095003604889, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2282 + }, + { + "epoch": 1.823482428115016, + "grad_norm": 0.1591203212738037, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2283 + }, + { + "epoch": 1.8242811501597445, + "grad_norm": 0.11261039227247238, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2284 + }, + { + "epoch": 1.8250798722044728, + "grad_norm": 0.06855058670043945, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2285 + }, + { + "epoch": 1.8258785942492013, + "grad_norm": 0.04728224128484726, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2286 + }, + { + "epoch": 1.8266773162939298, + "grad_norm": 0.0677042305469513, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2287 + }, + { + "epoch": 1.8274760383386581, + "grad_norm": 0.0836048573255539, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2288 + }, + { + "epoch": 1.8282747603833864, + "grad_norm": 0.0657985508441925, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2289 + }, + { + "epoch": 1.829073482428115, + "grad_norm": 0.05567999184131622, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2290 + }, + { + "epoch": 1.8298722044728435, + "grad_norm": 0.13710817694664001, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2291 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 0.14417411386966705, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2292 + }, + { + "epoch": 1.8314696485623003, + "grad_norm": 0.12273317575454712, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2293 + }, + { + "epoch": 1.8322683706070286, + "grad_norm": 0.12350328266620636, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2294 + }, + { + "epoch": 1.8330670926517572, + "grad_norm": 0.12832887470722198, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2295 + }, + { + "epoch": 1.8338658146964857, + "grad_norm": 0.17759868502616882, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2296 + }, + { + "epoch": 1.8346645367412142, + "grad_norm": 0.18485887348651886, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2297 + }, + { + "epoch": 1.8354632587859425, + "grad_norm": 0.11906488239765167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2298 + }, + { + "epoch": 1.8362619808306708, + "grad_norm": 0.04088319092988968, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2299 + }, + { + "epoch": 1.8370607028753994, + "grad_norm": 0.18988807499408722, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2300 + }, + { + "epoch": 1.8378594249201279, + "grad_norm": 0.2758033275604248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2301 + }, + { + "epoch": 1.8386581469648562, + "grad_norm": 0.26860401034355164, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2302 + }, + { + "epoch": 1.8394568690095847, + "grad_norm": 0.1770019680261612, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2303 + }, + { + "epoch": 1.840255591054313, + "grad_norm": 0.03740993142127991, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2304 + }, + { + "epoch": 1.8410543130990416, + "grad_norm": 0.13697518408298492, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2305 + }, + { + "epoch": 1.84185303514377, + "grad_norm": 0.15273790061473846, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2306 + }, + { + "epoch": 1.8426517571884984, + "grad_norm": 0.08181154727935791, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2307 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.05599624291062355, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2308 + }, + { + "epoch": 1.8442492012779552, + "grad_norm": 0.17429251968860626, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2309 + }, + { + "epoch": 1.8450479233226837, + "grad_norm": 0.20159491896629333, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2310 + }, + { + "epoch": 1.8458466453674123, + "grad_norm": 0.10825419425964355, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2311 + }, + { + "epoch": 1.8466453674121406, + "grad_norm": 0.0784185528755188, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2312 + }, + { + "epoch": 1.8474440894568689, + "grad_norm": 0.15851987898349762, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2313 + }, + { + "epoch": 1.8482428115015974, + "grad_norm": 0.11244971305131912, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2314 + }, + { + "epoch": 1.849041533546326, + "grad_norm": 0.04119047150015831, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2315 + }, + { + "epoch": 1.8498402555910545, + "grad_norm": 0.12872102856636047, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2316 + }, + { + "epoch": 1.8506389776357828, + "grad_norm": 0.1542259305715561, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2317 + }, + { + "epoch": 1.851437699680511, + "grad_norm": 0.09662868827581406, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2318 + }, + { + "epoch": 1.8522364217252396, + "grad_norm": 0.04452383890748024, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2319 + }, + { + "epoch": 1.8530351437699681, + "grad_norm": 0.03368959203362465, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2320 + }, + { + "epoch": 1.8538338658146964, + "grad_norm": 0.05867767333984375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2321 + }, + { + "epoch": 1.854632587859425, + "grad_norm": 0.0774846225976944, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2322 + }, + { + "epoch": 1.8554313099041533, + "grad_norm": 0.05172058939933777, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2323 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 0.06597824394702911, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2324 + }, + { + "epoch": 1.8570287539936103, + "grad_norm": 0.10818778723478317, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2325 + }, + { + "epoch": 1.8578274760383386, + "grad_norm": 0.12698976695537567, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2326 + }, + { + "epoch": 1.858626198083067, + "grad_norm": 0.06547659635543823, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2327 + }, + { + "epoch": 1.8594249201277955, + "grad_norm": 0.08613643050193787, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2328 + }, + { + "epoch": 1.860223642172524, + "grad_norm": 0.23452800512313843, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2329 + }, + { + "epoch": 1.8610223642172525, + "grad_norm": 0.29293227195739746, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2330 + }, + { + "epoch": 1.8618210862619808, + "grad_norm": 0.17590634524822235, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2331 + }, + { + "epoch": 1.8626198083067091, + "grad_norm": 0.09830035269260406, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2332 + }, + { + "epoch": 1.8634185303514377, + "grad_norm": 0.2336016595363617, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2333 + }, + { + "epoch": 1.8642172523961662, + "grad_norm": 0.22990736365318298, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2334 + }, + { + "epoch": 1.8650159744408947, + "grad_norm": 0.14177313446998596, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2335 + }, + { + "epoch": 1.865814696485623, + "grad_norm": 0.07447824627161026, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2336 + }, + { + "epoch": 1.8666134185303513, + "grad_norm": 0.20551882684230804, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2337 + }, + { + "epoch": 1.8674121405750799, + "grad_norm": 0.21193428337574005, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2338 + }, + { + "epoch": 1.8682108626198084, + "grad_norm": 0.09889520704746246, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2339 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 0.06506047397851944, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2340 + }, + { + "epoch": 1.869808306709265, + "grad_norm": 0.10613662004470825, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2341 + }, + { + "epoch": 1.8706070287539935, + "grad_norm": 0.13049691915512085, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2342 + }, + { + "epoch": 1.871405750798722, + "grad_norm": 0.07257628440856934, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2343 + }, + { + "epoch": 1.8722044728434506, + "grad_norm": 0.05402761325240135, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2344 + }, + { + "epoch": 1.873003194888179, + "grad_norm": 0.1298513114452362, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2345 + }, + { + "epoch": 1.8738019169329072, + "grad_norm": 0.18854250013828278, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2346 + }, + { + "epoch": 1.8746006389776357, + "grad_norm": 0.18749283254146576, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2347 + }, + { + "epoch": 1.8753993610223643, + "grad_norm": 0.0791897177696228, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2348 + }, + { + "epoch": 1.8761980830670928, + "grad_norm": 0.061554014682769775, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2349 + }, + { + "epoch": 1.876996805111821, + "grad_norm": 0.07776489108800888, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2350 + }, + { + "epoch": 1.8777955271565494, + "grad_norm": 0.06406589597463608, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2351 + }, + { + "epoch": 1.878594249201278, + "grad_norm": 0.04364178702235222, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2352 + }, + { + "epoch": 1.8793929712460065, + "grad_norm": 0.14296351373195648, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2353 + }, + { + "epoch": 1.880191693290735, + "grad_norm": 0.23554368317127228, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2354 + }, + { + "epoch": 1.8809904153354633, + "grad_norm": 0.17022013664245605, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 2355 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 0.055340252816677094, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2356 + }, + { + "epoch": 1.8825878594249201, + "grad_norm": 0.10552496463060379, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2357 + }, + { + "epoch": 1.8833865814696487, + "grad_norm": 0.1601826697587967, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2358 + }, + { + "epoch": 1.884185303514377, + "grad_norm": 0.15029270946979523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2359 + }, + { + "epoch": 1.8849840255591053, + "grad_norm": 0.05186127871274948, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2360 + }, + { + "epoch": 1.8857827476038338, + "grad_norm": 0.10678224265575409, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2361 + }, + { + "epoch": 1.8865814696485623, + "grad_norm": 0.1380450427532196, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2362 + }, + { + "epoch": 1.8873801916932909, + "grad_norm": 0.08721969276666641, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2363 + }, + { + "epoch": 1.8881789137380192, + "grad_norm": 0.09425338357686996, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2364 + }, + { + "epoch": 1.8889776357827475, + "grad_norm": 0.16815589368343353, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2365 + }, + { + "epoch": 1.889776357827476, + "grad_norm": 0.16181580722332, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2366 + }, + { + "epoch": 1.8905750798722045, + "grad_norm": 0.054028045386075974, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2367 + }, + { + "epoch": 1.891373801916933, + "grad_norm": 0.07199764251708984, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2368 + }, + { + "epoch": 1.8921725239616614, + "grad_norm": 0.08493109047412872, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2369 + }, + { + "epoch": 1.8929712460063897, + "grad_norm": 0.09665308892726898, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 2370 + }, + { + "epoch": 1.8937699680511182, + "grad_norm": 0.07975895702838898, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2371 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 0.06089888513088226, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2372 + }, + { + "epoch": 1.895367412140575, + "grad_norm": 0.04610683396458626, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2373 + }, + { + "epoch": 1.8961661341853036, + "grad_norm": 0.06083180755376816, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2374 + }, + { + "epoch": 1.8969648562300319, + "grad_norm": 0.07177560776472092, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 2375 + }, + { + "epoch": 1.8977635782747604, + "grad_norm": 0.04214467853307724, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2376 + }, + { + "epoch": 1.898562300319489, + "grad_norm": 0.05166957527399063, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2377 + }, + { + "epoch": 1.8993610223642172, + "grad_norm": 0.040181614458560944, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2378 + }, + { + "epoch": 1.9001597444089455, + "grad_norm": 0.043485358357429504, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2379 + }, + { + "epoch": 1.900958466453674, + "grad_norm": 0.07395761460065842, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2380 + }, + { + "epoch": 1.9017571884984026, + "grad_norm": 0.05133877694606781, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 2381 + }, + { + "epoch": 1.9025559105431311, + "grad_norm": 0.059279292821884155, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2382 + }, + { + "epoch": 1.9033546325878594, + "grad_norm": 0.07573487609624863, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2383 + }, + { + "epoch": 1.9041533546325877, + "grad_norm": 0.07013942301273346, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2384 + }, + { + "epoch": 1.9049520766773163, + "grad_norm": 0.14524684846401215, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2385 + }, + { + "epoch": 1.9057507987220448, + "grad_norm": 0.17374426126480103, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2386 + }, + { + "epoch": 1.9065495207667733, + "grad_norm": 0.1387263685464859, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2387 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 0.045813702046871185, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2388 + }, + { + "epoch": 1.90814696485623, + "grad_norm": 0.189321830868721, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2389 + }, + { + "epoch": 1.9089456869009584, + "grad_norm": 0.261329710483551, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2390 + }, + { + "epoch": 1.909744408945687, + "grad_norm": 0.1599399596452713, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2391 + }, + { + "epoch": 1.9105431309904153, + "grad_norm": 0.03977127745747566, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2392 + }, + { + "epoch": 1.9113418530351438, + "grad_norm": 0.16269442439079285, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2393 + }, + { + "epoch": 1.9121405750798721, + "grad_norm": 0.22963251173496246, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2394 + }, + { + "epoch": 1.9129392971246006, + "grad_norm": 0.1526031792163849, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2395 + }, + { + "epoch": 1.9137380191693292, + "grad_norm": 0.07236737757921219, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 2396 + }, + { + "epoch": 1.9145367412140575, + "grad_norm": 0.19993482530117035, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2397 + }, + { + "epoch": 1.9153354632587858, + "grad_norm": 0.18950621783733368, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2398 + }, + { + "epoch": 1.9161341853035143, + "grad_norm": 0.10046153515577316, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2399 + }, + { + "epoch": 1.9169329073482428, + "grad_norm": 0.07884453237056732, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2400 + }, + { + "epoch": 1.9177316293929714, + "grad_norm": 0.23947227001190186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2401 + }, + { + "epoch": 1.9185303514376997, + "grad_norm": 0.2662964165210724, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2402 + }, + { + "epoch": 1.919329073482428, + "grad_norm": 0.1257917582988739, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2403 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 0.09092582017183304, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2404 + }, + { + "epoch": 1.920926517571885, + "grad_norm": 0.19677215814590454, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2405 + }, + { + "epoch": 1.9217252396166136, + "grad_norm": 0.17972320318222046, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2406 + }, + { + "epoch": 1.9225239616613419, + "grad_norm": 0.06155665963888168, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2407 + }, + { + "epoch": 1.9233226837060702, + "grad_norm": 0.14805591106414795, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2408 + }, + { + "epoch": 1.9241214057507987, + "grad_norm": 0.2414662092924118, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2409 + }, + { + "epoch": 1.9249201277955272, + "grad_norm": 0.2084181308746338, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2410 + }, + { + "epoch": 1.9257188498402555, + "grad_norm": 0.05523146688938141, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2411 + }, + { + "epoch": 1.926517571884984, + "grad_norm": 0.13994552195072174, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2412 + }, + { + "epoch": 1.9273162939297124, + "grad_norm": 0.2648966312408447, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2413 + }, + { + "epoch": 1.928115015974441, + "grad_norm": 0.28959497809410095, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2414 + }, + { + "epoch": 1.9289137380191694, + "grad_norm": 0.11457488685846329, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2415 + }, + { + "epoch": 1.9297124600638977, + "grad_norm": 0.12448041886091232, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2416 + }, + { + "epoch": 1.930511182108626, + "grad_norm": 0.20807982981204987, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2417 + }, + { + "epoch": 1.9313099041533546, + "grad_norm": 0.14537623524665833, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2418 + }, + { + "epoch": 1.932108626198083, + "grad_norm": 0.0428709015250206, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2419 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.07923824340105057, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2420 + }, + { + "epoch": 1.93370607028754, + "grad_norm": 0.06046072393655777, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2421 + }, + { + "epoch": 1.9345047923322682, + "grad_norm": 0.05921380594372749, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2422 + }, + { + "epoch": 1.9353035143769968, + "grad_norm": 0.05324951559305191, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2423 + }, + { + "epoch": 1.9361022364217253, + "grad_norm": 0.060725487768650055, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2424 + }, + { + "epoch": 1.9369009584664538, + "grad_norm": 0.09305386245250702, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2425 + }, + { + "epoch": 1.9376996805111821, + "grad_norm": 0.12314888834953308, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2426 + }, + { + "epoch": 1.9384984025559104, + "grad_norm": 0.08590805530548096, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2427 + }, + { + "epoch": 1.939297124600639, + "grad_norm": 0.07134587317705154, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2428 + }, + { + "epoch": 1.9400958466453675, + "grad_norm": 0.04584966599941254, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2429 + }, + { + "epoch": 1.9408945686900958, + "grad_norm": 0.050389841198921204, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2430 + }, + { + "epoch": 1.9416932907348243, + "grad_norm": 0.055894333869218826, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2431 + }, + { + "epoch": 1.9424920127795526, + "grad_norm": 0.05231403559446335, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2432 + }, + { + "epoch": 1.9432907348242812, + "grad_norm": 0.04235154017806053, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2433 + }, + { + "epoch": 1.9440894568690097, + "grad_norm": 0.038994334638118744, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2434 + }, + { + "epoch": 1.944888178913738, + "grad_norm": 0.062291134148836136, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2435 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.10267619043588638, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2436 + }, + { + "epoch": 1.9464856230031948, + "grad_norm": 0.12227646261453629, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2437 + }, + { + "epoch": 1.9472843450479234, + "grad_norm": 0.07677904516458511, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2438 + }, + { + "epoch": 1.9480830670926519, + "grad_norm": 0.043213456869125366, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2439 + }, + { + "epoch": 1.9488817891373802, + "grad_norm": 0.0464320071041584, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2440 + }, + { + "epoch": 1.9496805111821085, + "grad_norm": 0.0488814078271389, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2441 + }, + { + "epoch": 1.950479233226837, + "grad_norm": 0.07102649658918381, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2442 + }, + { + "epoch": 1.9512779552715656, + "grad_norm": 0.056355372071266174, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2443 + }, + { + "epoch": 1.952076677316294, + "grad_norm": 0.05412770435214043, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2444 + }, + { + "epoch": 1.9528753993610224, + "grad_norm": 0.05533284693956375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2445 + }, + { + "epoch": 1.9536741214057507, + "grad_norm": 0.07065420597791672, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2446 + }, + { + "epoch": 1.9544728434504792, + "grad_norm": 0.0424923375248909, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2447 + }, + { + "epoch": 1.9552715654952078, + "grad_norm": 0.07682394236326218, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2448 + }, + { + "epoch": 1.956070287539936, + "grad_norm": 0.12305673956871033, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2449 + }, + { + "epoch": 1.9568690095846646, + "grad_norm": 0.12699945271015167, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2450 + }, + { + "epoch": 1.957667731629393, + "grad_norm": 0.09973076730966568, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2451 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 0.04687270149588585, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2452 + }, + { + "epoch": 1.95926517571885, + "grad_norm": 0.16843228042125702, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2453 + }, + { + "epoch": 1.9600638977635783, + "grad_norm": 0.27191975712776184, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2454 + }, + { + "epoch": 1.9608626198083066, + "grad_norm": 0.2563989460468292, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2455 + }, + { + "epoch": 1.961661341853035, + "grad_norm": 0.10264059901237488, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2456 + }, + { + "epoch": 1.9624600638977636, + "grad_norm": 0.12051466107368469, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2457 + }, + { + "epoch": 1.9632587859424921, + "grad_norm": 0.27400559186935425, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2458 + }, + { + "epoch": 1.9640575079872205, + "grad_norm": 0.2756473124027252, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2459 + }, + { + "epoch": 1.9648562300319488, + "grad_norm": 0.09925543516874313, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2460 + }, + { + "epoch": 1.9656549520766773, + "grad_norm": 0.18176420032978058, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2461 + }, + { + "epoch": 1.9664536741214058, + "grad_norm": 0.353693425655365, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 2462 + }, + { + "epoch": 1.9672523961661343, + "grad_norm": 0.30674099922180176, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2463 + }, + { + "epoch": 1.9680511182108626, + "grad_norm": 0.04689846560359001, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2464 + }, + { + "epoch": 1.968849840255591, + "grad_norm": 0.29758918285369873, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2465 + }, + { + "epoch": 1.9696485623003195, + "grad_norm": 0.363922655582428, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2466 + }, + { + "epoch": 1.970447284345048, + "grad_norm": 0.19258317351341248, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2467 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 0.10317967087030411, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2468 + }, + { + "epoch": 1.9720447284345048, + "grad_norm": 0.2375856637954712, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2469 + }, + { + "epoch": 1.9728434504792332, + "grad_norm": 0.13130125403404236, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2470 + }, + { + "epoch": 1.9736421725239617, + "grad_norm": 0.08131767064332962, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2471 + }, + { + "epoch": 1.9744408945686902, + "grad_norm": 0.14860530197620392, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2472 + }, + { + "epoch": 1.9752396166134185, + "grad_norm": 0.11777997016906738, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2473 + }, + { + "epoch": 1.9760383386581468, + "grad_norm": 0.08397025614976883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2474 + }, + { + "epoch": 1.9768370607028753, + "grad_norm": 0.08824057132005692, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2475 + }, + { + "epoch": 1.9776357827476039, + "grad_norm": 0.06647378206253052, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2476 + }, + { + "epoch": 1.9784345047923324, + "grad_norm": 0.038043633103370667, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2477 + }, + { + "epoch": 1.9792332268370607, + "grad_norm": 0.08245793730020523, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2478 + }, + { + "epoch": 1.980031948881789, + "grad_norm": 0.1402815282344818, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2479 + }, + { + "epoch": 1.9808306709265175, + "grad_norm": 0.15749140083789825, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2480 + }, + { + "epoch": 1.981629392971246, + "grad_norm": 0.09396994858980179, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2481 + }, + { + "epoch": 1.9824281150159746, + "grad_norm": 0.0725923553109169, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2482 + }, + { + "epoch": 1.983226837060703, + "grad_norm": 0.06790316104888916, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2483 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 0.04050496965646744, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2484 + }, + { + "epoch": 1.9848242811501597, + "grad_norm": 0.04245828837156296, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2485 + }, + { + "epoch": 1.9856230031948883, + "grad_norm": 0.04818668216466904, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2486 + }, + { + "epoch": 1.9864217252396166, + "grad_norm": 0.07091481238603592, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2487 + }, + { + "epoch": 1.9872204472843449, + "grad_norm": 0.08975768834352493, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2488 + }, + { + "epoch": 1.9880191693290734, + "grad_norm": 0.0920509397983551, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2489 + }, + { + "epoch": 1.988817891373802, + "grad_norm": 0.06188343092799187, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2490 + }, + { + "epoch": 1.9896166134185305, + "grad_norm": 0.03998660668730736, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2491 + }, + { + "epoch": 1.9904153354632588, + "grad_norm": 0.03859339654445648, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2492 + }, + { + "epoch": 1.991214057507987, + "grad_norm": 0.050228461623191833, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2493 + }, + { + "epoch": 1.9920127795527156, + "grad_norm": 0.04037710279226303, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2494 + }, + { + "epoch": 1.9928115015974441, + "grad_norm": 0.04584654048085213, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2495 + }, + { + "epoch": 1.9936102236421727, + "grad_norm": 0.03696245700120926, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2496 + }, + { + "epoch": 1.994408945686901, + "grad_norm": 0.04600491747260094, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2497 + }, + { + "epoch": 1.9952076677316293, + "grad_norm": 0.0943571925163269, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2498 + }, + { + "epoch": 1.9960063897763578, + "grad_norm": 0.11350230127573013, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2499 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.09816325455904007, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2500 + }, + { + "epoch": 1.9976038338658149, + "grad_norm": 0.05887974426150322, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2501 + }, + { + "epoch": 1.9984025559105432, + "grad_norm": 0.039232514798641205, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2502 + }, + { + "epoch": 1.9992012779552715, + "grad_norm": 0.10776908695697784, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2503 + }, + { + "epoch": 2.0, + "grad_norm": 0.1708499789237976, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2504 + }, + { + "epoch": 2.0007987220447285, + "grad_norm": 0.12712575495243073, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2505 + }, + { + "epoch": 2.001597444089457, + "grad_norm": 0.04130035266280174, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2506 + }, + { + "epoch": 2.002396166134185, + "grad_norm": 0.08062197268009186, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2507 + }, + { + "epoch": 2.0031948881789137, + "grad_norm": 0.11429931968450546, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2508 + }, + { + "epoch": 2.003993610223642, + "grad_norm": 0.06290867924690247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2509 + }, + { + "epoch": 2.0047923322683707, + "grad_norm": 0.043735455721616745, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2510 + }, + { + "epoch": 2.0055910543130993, + "grad_norm": 0.08331973850727081, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2511 + }, + { + "epoch": 2.0063897763578273, + "grad_norm": 0.07424676418304443, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2512 + }, + { + "epoch": 2.007188498402556, + "grad_norm": 0.0450097881257534, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2513 + }, + { + "epoch": 2.0079872204472844, + "grad_norm": 0.05486248433589935, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2514 + }, + { + "epoch": 2.008785942492013, + "grad_norm": 0.03456762805581093, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2515 + }, + { + "epoch": 2.009584664536741, + "grad_norm": 0.060457173734903336, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2516 + }, + { + "epoch": 2.0103833865814695, + "grad_norm": 0.11361896246671677, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2517 + }, + { + "epoch": 2.011182108626198, + "grad_norm": 0.13272768259048462, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2518 + }, + { + "epoch": 2.0119808306709266, + "grad_norm": 0.06579867750406265, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2519 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.06989869475364685, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2520 + }, + { + "epoch": 2.013578274760383, + "grad_norm": 0.10227718949317932, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2521 + }, + { + "epoch": 2.0143769968051117, + "grad_norm": 0.1155320331454277, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2522 + }, + { + "epoch": 2.0151757188498403, + "grad_norm": 0.08428250998258591, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2523 + }, + { + "epoch": 2.015974440894569, + "grad_norm": 0.07322479784488678, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2524 + }, + { + "epoch": 2.0167731629392973, + "grad_norm": 0.0683116540312767, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2525 + }, + { + "epoch": 2.0175718849840254, + "grad_norm": 0.05594201013445854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2526 + }, + { + "epoch": 2.018370607028754, + "grad_norm": 0.08582351356744766, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2527 + }, + { + "epoch": 2.0191693290734825, + "grad_norm": 0.16223077476024628, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2528 + }, + { + "epoch": 2.019968051118211, + "grad_norm": 0.23563791811466217, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2529 + }, + { + "epoch": 2.0207667731629395, + "grad_norm": 0.2101173847913742, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2530 + }, + { + "epoch": 2.0215654952076676, + "grad_norm": 0.14453741908073425, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2531 + }, + { + "epoch": 2.022364217252396, + "grad_norm": 0.050489380955696106, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2532 + }, + { + "epoch": 2.0231629392971247, + "grad_norm": 0.17723125219345093, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2533 + }, + { + "epoch": 2.023961661341853, + "grad_norm": 0.18600088357925415, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2534 + }, + { + "epoch": 2.0247603833865813, + "grad_norm": 0.10898424685001373, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2535 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.07256787270307541, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2536 + }, + { + "epoch": 2.0263578274760383, + "grad_norm": 0.1978672444820404, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2537 + }, + { + "epoch": 2.027156549520767, + "grad_norm": 0.20623594522476196, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2538 + }, + { + "epoch": 2.0279552715654954, + "grad_norm": 0.08837094157934189, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2539 + }, + { + "epoch": 2.0287539936102235, + "grad_norm": 0.10977557301521301, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2540 + }, + { + "epoch": 2.029552715654952, + "grad_norm": 0.24850067496299744, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2541 + }, + { + "epoch": 2.0303514376996805, + "grad_norm": 0.29207590222358704, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2542 + }, + { + "epoch": 2.031150159744409, + "grad_norm": 0.1985940933227539, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2543 + }, + { + "epoch": 2.0319488817891376, + "grad_norm": 0.04519326612353325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2544 + }, + { + "epoch": 2.0327476038338657, + "grad_norm": 0.16939495503902435, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2545 + }, + { + "epoch": 2.033546325878594, + "grad_norm": 0.270275354385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2546 + }, + { + "epoch": 2.0343450479233227, + "grad_norm": 0.21180108189582825, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2547 + }, + { + "epoch": 2.0351437699680512, + "grad_norm": 0.0469316728413105, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2548 + }, + { + "epoch": 2.0359424920127798, + "grad_norm": 0.1845361739397049, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2549 + }, + { + "epoch": 2.036741214057508, + "grad_norm": 0.2276308536529541, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2550 + }, + { + "epoch": 2.0375399361022364, + "grad_norm": 0.11676277965307236, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2551 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 0.1021813154220581, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2552 + }, + { + "epoch": 2.0391373801916934, + "grad_norm": 0.28504467010498047, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2553 + }, + { + "epoch": 2.0399361022364215, + "grad_norm": 0.2821798324584961, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2554 + }, + { + "epoch": 2.04073482428115, + "grad_norm": 0.09673242270946503, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2555 + }, + { + "epoch": 2.0415335463258786, + "grad_norm": 0.1784241944551468, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2556 + }, + { + "epoch": 2.042332268370607, + "grad_norm": 0.30749815702438354, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2557 + }, + { + "epoch": 2.0431309904153356, + "grad_norm": 0.2625802457332611, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2558 + }, + { + "epoch": 2.0439297124600637, + "grad_norm": 0.0651462972164154, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 2559 + }, + { + "epoch": 2.0447284345047922, + "grad_norm": 0.2103819102048874, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 2560 + }, + { + "epoch": 2.0455271565495208, + "grad_norm": 0.2854102849960327, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2561 + }, + { + "epoch": 2.0463258785942493, + "grad_norm": 0.14184293150901794, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2562 + }, + { + "epoch": 2.047124600638978, + "grad_norm": 0.06151473522186279, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2563 + }, + { + "epoch": 2.047923322683706, + "grad_norm": 0.1858600378036499, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2564 + }, + { + "epoch": 2.0487220447284344, + "grad_norm": 0.19997341930866241, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2565 + }, + { + "epoch": 2.049520766773163, + "grad_norm": 0.0924893170595169, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2566 + }, + { + "epoch": 2.0503194888178915, + "grad_norm": 0.14571507275104523, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2567 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.2566513121128082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2568 + }, + { + "epoch": 2.051916932907348, + "grad_norm": 0.24462486803531647, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2569 + }, + { + "epoch": 2.0527156549520766, + "grad_norm": 0.10544434189796448, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2570 + }, + { + "epoch": 2.053514376996805, + "grad_norm": 0.08675491809844971, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2571 + }, + { + "epoch": 2.0543130990415337, + "grad_norm": 0.18398417532444, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2572 + }, + { + "epoch": 2.055111821086262, + "grad_norm": 0.15167878568172455, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2573 + }, + { + "epoch": 2.0559105431309903, + "grad_norm": 0.06932301074266434, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2574 + }, + { + "epoch": 2.056709265175719, + "grad_norm": 0.06368319690227509, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2575 + }, + { + "epoch": 2.0575079872204474, + "grad_norm": 0.11785905808210373, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2576 + }, + { + "epoch": 2.058306709265176, + "grad_norm": 0.05494855344295502, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2577 + }, + { + "epoch": 2.059105431309904, + "grad_norm": 0.10618741810321808, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2578 + }, + { + "epoch": 2.0599041533546325, + "grad_norm": 0.14729735255241394, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2579 + }, + { + "epoch": 2.060702875399361, + "grad_norm": 0.08014677464962006, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2580 + }, + { + "epoch": 2.0615015974440896, + "grad_norm": 0.07460471242666245, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2581 + }, + { + "epoch": 2.062300319488818, + "grad_norm": 0.12884479761123657, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2582 + }, + { + "epoch": 2.063099041533546, + "grad_norm": 0.11224616318941116, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2583 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.06026687100529671, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2584 + }, + { + "epoch": 2.0646964856230032, + "grad_norm": 0.06690093874931335, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2585 + }, + { + "epoch": 2.0654952076677318, + "grad_norm": 0.10095079988241196, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2586 + }, + { + "epoch": 2.06629392971246, + "grad_norm": 0.08353506028652191, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2587 + }, + { + "epoch": 2.0670926517571884, + "grad_norm": 0.07060668617486954, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2588 + }, + { + "epoch": 2.067891373801917, + "grad_norm": 0.07298587262630463, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2589 + }, + { + "epoch": 2.0686900958466454, + "grad_norm": 0.04319034889340401, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2590 + }, + { + "epoch": 2.069488817891374, + "grad_norm": 0.04229504242539406, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2591 + }, + { + "epoch": 2.070287539936102, + "grad_norm": 0.05476998910307884, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2592 + }, + { + "epoch": 2.0710862619808306, + "grad_norm": 0.039188139140605927, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2593 + }, + { + "epoch": 2.071884984025559, + "grad_norm": 0.058993417769670486, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2594 + }, + { + "epoch": 2.0726837060702876, + "grad_norm": 0.04871759191155434, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2595 + }, + { + "epoch": 2.073482428115016, + "grad_norm": 0.037119925022125244, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2596 + }, + { + "epoch": 2.0742811501597442, + "grad_norm": 0.06476760655641556, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2597 + }, + { + "epoch": 2.0750798722044728, + "grad_norm": 0.03558475151658058, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2598 + }, + { + "epoch": 2.0758785942492013, + "grad_norm": 0.03988872841000557, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2599 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.04446236789226532, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2600 + }, + { + "epoch": 2.0774760383386583, + "grad_norm": 0.058075740933418274, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2601 + }, + { + "epoch": 2.0782747603833864, + "grad_norm": 0.10492820292711258, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2602 + }, + { + "epoch": 2.079073482428115, + "grad_norm": 0.1374005526304245, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2603 + }, + { + "epoch": 2.0798722044728435, + "grad_norm": 0.10932788252830505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 2604 + }, + { + "epoch": 2.080670926517572, + "grad_norm": 0.035826049745082855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2605 + }, + { + "epoch": 2.0814696485623, + "grad_norm": 0.10934802889823914, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2606 + }, + { + "epoch": 2.0822683706070286, + "grad_norm": 0.13302485644817352, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2607 + }, + { + "epoch": 2.083067092651757, + "grad_norm": 0.11253390461206436, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2608 + }, + { + "epoch": 2.0838658146964857, + "grad_norm": 0.04634593054652214, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2609 + }, + { + "epoch": 2.084664536741214, + "grad_norm": 0.21137909591197968, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2610 + }, + { + "epoch": 2.0854632587859423, + "grad_norm": 0.2771414816379547, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2611 + }, + { + "epoch": 2.086261980830671, + "grad_norm": 0.1959906965494156, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2612 + }, + { + "epoch": 2.0870607028753994, + "grad_norm": 0.042694322764873505, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2613 + }, + { + "epoch": 2.087859424920128, + "grad_norm": 0.15753871202468872, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2614 + }, + { + "epoch": 2.0886581469648564, + "grad_norm": 0.1917339563369751, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2615 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.05056089907884598, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2616 + }, + { + "epoch": 2.090255591054313, + "grad_norm": 0.16167999804019928, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2617 + }, + { + "epoch": 2.0910543130990416, + "grad_norm": 0.21019205451011658, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2618 + }, + { + "epoch": 2.09185303514377, + "grad_norm": 0.12859253585338593, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2619 + }, + { + "epoch": 2.0926517571884986, + "grad_norm": 0.04561556130647659, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2620 + }, + { + "epoch": 2.0934504792332267, + "grad_norm": 0.19915086030960083, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2621 + }, + { + "epoch": 2.094249201277955, + "grad_norm": 0.2792043685913086, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2622 + }, + { + "epoch": 2.0950479233226837, + "grad_norm": 0.16861289739608765, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2623 + }, + { + "epoch": 2.0958466453674123, + "grad_norm": 0.08431511372327805, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2624 + }, + { + "epoch": 2.0966453674121404, + "grad_norm": 0.26860734820365906, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2625 + }, + { + "epoch": 2.097444089456869, + "grad_norm": 0.2949545979499817, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2626 + }, + { + "epoch": 2.0982428115015974, + "grad_norm": 0.12639857828617096, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2627 + }, + { + "epoch": 2.099041533546326, + "grad_norm": 0.14675533771514893, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2628 + }, + { + "epoch": 2.0998402555910545, + "grad_norm": 0.29298654198646545, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2629 + }, + { + "epoch": 2.1006389776357826, + "grad_norm": 0.20049460232257843, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2630 + }, + { + "epoch": 2.101437699680511, + "grad_norm": 0.05280651897192001, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2631 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.2405036836862564, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2632 + }, + { + "epoch": 2.103035143769968, + "grad_norm": 0.29925718903541565, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2633 + }, + { + "epoch": 2.1038338658146967, + "grad_norm": 0.1330690085887909, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2634 + }, + { + "epoch": 2.1046325878594248, + "grad_norm": 0.11366300284862518, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2635 + }, + { + "epoch": 2.1054313099041533, + "grad_norm": 0.184611514210701, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2636 + }, + { + "epoch": 2.106230031948882, + "grad_norm": 0.0942547619342804, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2637 + }, + { + "epoch": 2.1070287539936103, + "grad_norm": 0.09224486351013184, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2638 + }, + { + "epoch": 2.107827476038339, + "grad_norm": 0.2167433351278305, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2639 + }, + { + "epoch": 2.108626198083067, + "grad_norm": 0.20001453161239624, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2640 + }, + { + "epoch": 2.1094249201277955, + "grad_norm": 0.0551394522190094, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2641 + }, + { + "epoch": 2.110223642172524, + "grad_norm": 0.14991897344589233, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2642 + }, + { + "epoch": 2.1110223642172525, + "grad_norm": 0.21038007736206055, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2643 + }, + { + "epoch": 2.1118210862619806, + "grad_norm": 0.11942024528980255, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2644 + }, + { + "epoch": 2.112619808306709, + "grad_norm": 0.14938029646873474, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2645 + }, + { + "epoch": 2.1134185303514377, + "grad_norm": 0.3405923843383789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2646 + }, + { + "epoch": 2.114217252396166, + "grad_norm": 0.3363925814628601, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2647 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.12379220873117447, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2648 + }, + { + "epoch": 2.115814696485623, + "grad_norm": 0.1583731323480606, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2649 + }, + { + "epoch": 2.1166134185303513, + "grad_norm": 0.2941076457500458, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2650 + }, + { + "epoch": 2.11741214057508, + "grad_norm": 0.18513287603855133, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2651 + }, + { + "epoch": 2.1182108626198084, + "grad_norm": 0.057797662913799286, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2652 + }, + { + "epoch": 2.119009584664537, + "grad_norm": 0.12461342662572861, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2653 + }, + { + "epoch": 2.119808306709265, + "grad_norm": 0.06276709586381912, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2654 + }, + { + "epoch": 2.1206070287539935, + "grad_norm": 0.06073528528213501, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2655 + }, + { + "epoch": 2.121405750798722, + "grad_norm": 0.07055814564228058, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2656 + }, + { + "epoch": 2.1222044728434506, + "grad_norm": 0.03508429974317551, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2657 + }, + { + "epoch": 2.123003194888179, + "grad_norm": 0.0474206916987896, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2658 + }, + { + "epoch": 2.123801916932907, + "grad_norm": 0.04067448526620865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2659 + }, + { + "epoch": 2.1246006389776357, + "grad_norm": 0.060025133192539215, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2660 + }, + { + "epoch": 2.1253993610223643, + "grad_norm": 0.061696235090494156, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2661 + }, + { + "epoch": 2.126198083067093, + "grad_norm": 0.060907844454050064, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2662 + }, + { + "epoch": 2.126996805111821, + "grad_norm": 0.06122025474905968, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 2663 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.06885300576686859, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2664 + }, + { + "epoch": 2.128594249201278, + "grad_norm": 0.047428976744413376, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2665 + }, + { + "epoch": 2.1293929712460065, + "grad_norm": 0.036644674837589264, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2666 + }, + { + "epoch": 2.130191693290735, + "grad_norm": 0.04983266070485115, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2667 + }, + { + "epoch": 2.130990415335463, + "grad_norm": 0.09072417765855789, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2668 + }, + { + "epoch": 2.1317891373801916, + "grad_norm": 0.10644412785768509, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2669 + }, + { + "epoch": 2.13258785942492, + "grad_norm": 0.07350479066371918, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2670 + }, + { + "epoch": 2.1333865814696487, + "grad_norm": 0.041709840297698975, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2671 + }, + { + "epoch": 2.134185303514377, + "grad_norm": 0.043592557311058044, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2672 + }, + { + "epoch": 2.1349840255591053, + "grad_norm": 0.04548558592796326, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2673 + }, + { + "epoch": 2.135782747603834, + "grad_norm": 0.03937267139554024, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2674 + }, + { + "epoch": 2.1365814696485623, + "grad_norm": 0.05674131214618683, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2675 + }, + { + "epoch": 2.137380191693291, + "grad_norm": 0.0857989713549614, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2676 + }, + { + "epoch": 2.1381789137380194, + "grad_norm": 0.12659871578216553, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2677 + }, + { + "epoch": 2.1389776357827475, + "grad_norm": 0.10000529885292053, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2678 + }, + { + "epoch": 2.139776357827476, + "grad_norm": 0.060805950313806534, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2679 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.20407895743846893, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2680 + }, + { + "epoch": 2.141373801916933, + "grad_norm": 0.21931609511375427, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2681 + }, + { + "epoch": 2.142172523961661, + "grad_norm": 0.0947318896651268, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2682 + }, + { + "epoch": 2.1429712460063897, + "grad_norm": 0.10082453489303589, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2683 + }, + { + "epoch": 2.143769968051118, + "grad_norm": 0.2510482370853424, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2684 + }, + { + "epoch": 2.1445686900958467, + "grad_norm": 0.2802210748195648, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 2685 + }, + { + "epoch": 2.1453674121405752, + "grad_norm": 0.18770602345466614, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2686 + }, + { + "epoch": 2.1461661341853033, + "grad_norm": 0.048588722944259644, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2687 + }, + { + "epoch": 2.146964856230032, + "grad_norm": 0.1443304419517517, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2688 + }, + { + "epoch": 2.1477635782747604, + "grad_norm": 0.22439543902873993, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2689 + }, + { + "epoch": 2.148562300319489, + "grad_norm": 0.16312581300735474, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2690 + }, + { + "epoch": 2.1493610223642174, + "grad_norm": 0.08721408247947693, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2691 + }, + { + "epoch": 2.1501597444089455, + "grad_norm": 0.2756902873516083, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2692 + }, + { + "epoch": 2.150958466453674, + "grad_norm": 0.2834199070930481, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2693 + }, + { + "epoch": 2.1517571884984026, + "grad_norm": 0.1190086081624031, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2694 + }, + { + "epoch": 2.152555910543131, + "grad_norm": 0.1246909499168396, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2695 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.2244880348443985, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2696 + }, + { + "epoch": 2.1541533546325877, + "grad_norm": 0.1424233317375183, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2697 + }, + { + "epoch": 2.1549520766773163, + "grad_norm": 0.10756697505712509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2698 + }, + { + "epoch": 2.155750798722045, + "grad_norm": 0.1688450276851654, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2699 + }, + { + "epoch": 2.1565495207667733, + "grad_norm": 0.12139362096786499, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 2700 + }, + { + "epoch": 2.1573482428115014, + "grad_norm": 0.07833441346883774, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 2701 + }, + { + "epoch": 2.15814696485623, + "grad_norm": 0.22099994122982025, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2702 + }, + { + "epoch": 2.1589456869009584, + "grad_norm": 0.190511554479599, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2703 + }, + { + "epoch": 2.159744408945687, + "grad_norm": 0.07637764513492584, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2704 + }, + { + "epoch": 2.1605431309904155, + "grad_norm": 0.06381702423095703, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2705 + }, + { + "epoch": 2.1613418530351436, + "grad_norm": 0.1343991458415985, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2706 + }, + { + "epoch": 2.162140575079872, + "grad_norm": 0.13090470433235168, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2707 + }, + { + "epoch": 2.1629392971246006, + "grad_norm": 0.04627209156751633, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2708 + }, + { + "epoch": 2.163738019169329, + "grad_norm": 0.060849517583847046, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2709 + }, + { + "epoch": 2.1645367412140577, + "grad_norm": 0.06780707836151123, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2710 + }, + { + "epoch": 2.165335463258786, + "grad_norm": 0.07282490283250809, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 2711 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 0.07168543338775635, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2712 + }, + { + "epoch": 2.166932907348243, + "grad_norm": 0.08716403692960739, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2713 + }, + { + "epoch": 2.1677316293929714, + "grad_norm": 0.09366965293884277, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2714 + }, + { + "epoch": 2.1685303514377, + "grad_norm": 0.09121392667293549, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2715 + }, + { + "epoch": 2.169329073482428, + "grad_norm": 0.06912577152252197, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2716 + }, + { + "epoch": 2.1701277955271565, + "grad_norm": 0.046476542949676514, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2717 + }, + { + "epoch": 2.170926517571885, + "grad_norm": 0.04065564647316933, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2718 + }, + { + "epoch": 2.1717252396166136, + "grad_norm": 0.044998086988925934, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2719 + }, + { + "epoch": 2.1725239616613417, + "grad_norm": 0.04588993638753891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2720 + }, + { + "epoch": 2.17332268370607, + "grad_norm": 0.05954091623425484, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2721 + }, + { + "epoch": 2.1741214057507987, + "grad_norm": 0.07627220451831818, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2722 + }, + { + "epoch": 2.1749201277955272, + "grad_norm": 0.0832771435379982, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2723 + }, + { + "epoch": 2.1757188498402558, + "grad_norm": 0.09901522845029831, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2724 + }, + { + "epoch": 2.176517571884984, + "grad_norm": 0.05773104354739189, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2725 + }, + { + "epoch": 2.1773162939297124, + "grad_norm": 0.0783318281173706, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2726 + }, + { + "epoch": 2.178115015974441, + "grad_norm": 0.12447014451026917, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2727 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.08944697678089142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2728 + }, + { + "epoch": 2.179712460063898, + "grad_norm": 0.07295451313257217, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2729 + }, + { + "epoch": 2.180511182108626, + "grad_norm": 0.1335693746805191, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2730 + }, + { + "epoch": 2.1813099041533546, + "grad_norm": 0.14618094265460968, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2731 + }, + { + "epoch": 2.182108626198083, + "grad_norm": 0.05047796294093132, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2732 + }, + { + "epoch": 2.1829073482428116, + "grad_norm": 0.18955212831497192, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2733 + }, + { + "epoch": 2.18370607028754, + "grad_norm": 0.3394540250301361, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2734 + }, + { + "epoch": 2.1845047923322682, + "grad_norm": 0.34607887268066406, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2735 + }, + { + "epoch": 2.1853035143769968, + "grad_norm": 0.19489939510822296, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2736 + }, + { + "epoch": 2.1861022364217253, + "grad_norm": 0.06775379180908203, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2737 + }, + { + "epoch": 2.186900958466454, + "grad_norm": 0.2376859039068222, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2738 + }, + { + "epoch": 2.187699680511182, + "grad_norm": 0.22686026990413666, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2739 + }, + { + "epoch": 2.1884984025559104, + "grad_norm": 0.059437282383441925, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2740 + }, + { + "epoch": 2.189297124600639, + "grad_norm": 0.184672549366951, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2741 + }, + { + "epoch": 2.1900958466453675, + "grad_norm": 0.21975156664848328, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2742 + }, + { + "epoch": 2.190894568690096, + "grad_norm": 0.08795829117298126, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2743 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.1045440062880516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2744 + }, + { + "epoch": 2.1924920127795526, + "grad_norm": 0.21037985384464264, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2745 + }, + { + "epoch": 2.193290734824281, + "grad_norm": 0.17791713774204254, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2746 + }, + { + "epoch": 2.1940894568690097, + "grad_norm": 0.06028178334236145, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2747 + }, + { + "epoch": 2.194888178913738, + "grad_norm": 0.0801217257976532, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2748 + }, + { + "epoch": 2.1956869009584663, + "grad_norm": 0.11564524471759796, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2749 + }, + { + "epoch": 2.196485623003195, + "grad_norm": 0.0652003139257431, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2750 + }, + { + "epoch": 2.1972843450479234, + "grad_norm": 0.057818979024887085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2751 + }, + { + "epoch": 2.198083067092652, + "grad_norm": 0.10466332733631134, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2752 + }, + { + "epoch": 2.1988817891373804, + "grad_norm": 0.09350129216909409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2753 + }, + { + "epoch": 2.1996805111821085, + "grad_norm": 0.04295926168560982, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2754 + }, + { + "epoch": 2.200479233226837, + "grad_norm": 0.0851534903049469, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2755 + }, + { + "epoch": 2.2012779552715656, + "grad_norm": 0.1857217401266098, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2756 + }, + { + "epoch": 2.202076677316294, + "grad_norm": 0.18267984688282013, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2757 + }, + { + "epoch": 2.202875399361022, + "grad_norm": 0.07249841094017029, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2758 + }, + { + "epoch": 2.2036741214057507, + "grad_norm": 0.14335495233535767, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2759 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.24338914453983307, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2760 + }, + { + "epoch": 2.2052715654952078, + "grad_norm": 0.17772778868675232, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2761 + }, + { + "epoch": 2.2060702875399363, + "grad_norm": 0.04809113219380379, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2762 + }, + { + "epoch": 2.2068690095846644, + "grad_norm": 0.09682228416204453, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2763 + }, + { + "epoch": 2.207667731629393, + "grad_norm": 0.13868102431297302, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2764 + }, + { + "epoch": 2.2084664536741214, + "grad_norm": 0.10956277698278427, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2765 + }, + { + "epoch": 2.20926517571885, + "grad_norm": 0.06163526698946953, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2766 + }, + { + "epoch": 2.2100638977635785, + "grad_norm": 0.14519700407981873, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2767 + }, + { + "epoch": 2.2108626198083066, + "grad_norm": 0.12486071139574051, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2768 + }, + { + "epoch": 2.211661341853035, + "grad_norm": 0.0414549857378006, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2769 + }, + { + "epoch": 2.2124600638977636, + "grad_norm": 0.13828913867473602, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2770 + }, + { + "epoch": 2.213258785942492, + "grad_norm": 0.18277914822101593, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2771 + }, + { + "epoch": 2.2140575079872207, + "grad_norm": 0.15727964043617249, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2772 + }, + { + "epoch": 2.2148562300319488, + "grad_norm": 0.07437993586063385, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2773 + }, + { + "epoch": 2.2156549520766773, + "grad_norm": 0.08192550390958786, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2774 + }, + { + "epoch": 2.216453674121406, + "grad_norm": 0.1804617941379547, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2775 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.18431466817855835, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2776 + }, + { + "epoch": 2.2180511182108624, + "grad_norm": 0.11281057447195053, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2777 + }, + { + "epoch": 2.218849840255591, + "grad_norm": 0.0398496650159359, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2778 + }, + { + "epoch": 2.2196485623003195, + "grad_norm": 0.16930198669433594, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2779 + }, + { + "epoch": 2.220447284345048, + "grad_norm": 0.2384660542011261, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2780 + }, + { + "epoch": 2.2212460063897765, + "grad_norm": 0.18867406249046326, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2781 + }, + { + "epoch": 2.2220447284345046, + "grad_norm": 0.041189488023519516, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2782 + }, + { + "epoch": 2.222843450479233, + "grad_norm": 0.21946212649345398, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2783 + }, + { + "epoch": 2.2236421725239617, + "grad_norm": 0.3394725024700165, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2784 + }, + { + "epoch": 2.22444089456869, + "grad_norm": 0.09503358602523804, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2785 + }, + { + "epoch": 2.2252396166134187, + "grad_norm": 0.180524080991745, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2786 + }, + { + "epoch": 2.226038338658147, + "grad_norm": 0.2961865961551666, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2787 + }, + { + "epoch": 2.2268370607028753, + "grad_norm": 0.25913500785827637, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2788 + }, + { + "epoch": 2.227635782747604, + "grad_norm": 0.08123381435871124, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2789 + }, + { + "epoch": 2.2284345047923324, + "grad_norm": 0.18587692081928253, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2790 + }, + { + "epoch": 2.229233226837061, + "grad_norm": 0.29838815331459045, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2791 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.2115599811077118, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2792 + }, + { + "epoch": 2.2308306709265175, + "grad_norm": 0.04708286374807358, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2793 + }, + { + "epoch": 2.231629392971246, + "grad_norm": 0.224795401096344, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2794 + }, + { + "epoch": 2.2324281150159746, + "grad_norm": 0.2673366665840149, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2795 + }, + { + "epoch": 2.2332268370607027, + "grad_norm": 0.1223720833659172, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2796 + }, + { + "epoch": 2.234025559105431, + "grad_norm": 0.12798862159252167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2797 + }, + { + "epoch": 2.2348242811501597, + "grad_norm": 0.25721317529678345, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2798 + }, + { + "epoch": 2.2356230031948883, + "grad_norm": 0.16970157623291016, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2799 + }, + { + "epoch": 2.236421725239617, + "grad_norm": 0.1311950534582138, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 2800 + }, + { + "epoch": 2.237220447284345, + "grad_norm": 0.32154732942581177, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2801 + }, + { + "epoch": 2.2380191693290734, + "grad_norm": 0.23601645231246948, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2802 + }, + { + "epoch": 2.238817891373802, + "grad_norm": 0.08307314664125443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2803 + }, + { + "epoch": 2.2396166134185305, + "grad_norm": 0.31183329224586487, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 2804 + }, + { + "epoch": 2.2404153354632586, + "grad_norm": 0.27391767501831055, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2805 + }, + { + "epoch": 2.241214057507987, + "grad_norm": 0.07247646898031235, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2806 + }, + { + "epoch": 2.2420127795527156, + "grad_norm": 0.1882690042257309, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2807 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.18179158866405487, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2808 + }, + { + "epoch": 2.2436102236421727, + "grad_norm": 0.10761548578739166, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2809 + }, + { + "epoch": 2.244408945686901, + "grad_norm": 0.3067700266838074, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2810 + }, + { + "epoch": 2.2452076677316293, + "grad_norm": 0.17450691759586334, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 2811 + }, + { + "epoch": 2.246006389776358, + "grad_norm": 0.14480780065059662, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2812 + }, + { + "epoch": 2.2468051118210863, + "grad_norm": 0.3325321078300476, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2813 + }, + { + "epoch": 2.247603833865815, + "grad_norm": 0.26238250732421875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2814 + }, + { + "epoch": 2.248402555910543, + "grad_norm": 0.07829522341489792, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2815 + }, + { + "epoch": 2.2492012779552715, + "grad_norm": 0.269721657037735, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2816 + }, + { + "epoch": 2.25, + "grad_norm": 0.16362956166267395, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2817 + }, + { + "epoch": 2.2507987220447285, + "grad_norm": 0.08129733055830002, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2818 + }, + { + "epoch": 2.251597444089457, + "grad_norm": 0.18430721759796143, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2819 + }, + { + "epoch": 2.252396166134185, + "grad_norm": 0.09634844213724136, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2820 + }, + { + "epoch": 2.2531948881789137, + "grad_norm": 0.08204549551010132, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2821 + }, + { + "epoch": 2.253993610223642, + "grad_norm": 0.1140882819890976, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2822 + }, + { + "epoch": 2.2547923322683707, + "grad_norm": 0.05056345462799072, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2823 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.06505320966243744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2824 + }, + { + "epoch": 2.2563897763578273, + "grad_norm": 0.11316727101802826, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2825 + }, + { + "epoch": 2.257188498402556, + "grad_norm": 0.1036633774638176, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2826 + }, + { + "epoch": 2.2579872204472844, + "grad_norm": 0.0470670685172081, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 2827 + }, + { + "epoch": 2.258785942492013, + "grad_norm": 0.0880327895283699, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2828 + }, + { + "epoch": 2.2595846645367414, + "grad_norm": 0.07664912939071655, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2829 + }, + { + "epoch": 2.2603833865814695, + "grad_norm": 0.049471575766801834, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2830 + }, + { + "epoch": 2.261182108626198, + "grad_norm": 0.04288775101304054, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2831 + }, + { + "epoch": 2.2619808306709266, + "grad_norm": 0.10124537348747253, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2832 + }, + { + "epoch": 2.262779552715655, + "grad_norm": 0.13865061104297638, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2833 + }, + { + "epoch": 2.263578274760383, + "grad_norm": 0.10227467864751816, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2834 + }, + { + "epoch": 2.2643769968051117, + "grad_norm": 0.050575822591781616, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 2835 + }, + { + "epoch": 2.2651757188498403, + "grad_norm": 0.044946715235710144, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2836 + }, + { + "epoch": 2.265974440894569, + "grad_norm": 0.0712895616889, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2837 + }, + { + "epoch": 2.2667731629392973, + "grad_norm": 0.07044374942779541, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2838 + }, + { + "epoch": 2.2675718849840254, + "grad_norm": 0.04518461972475052, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2839 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 0.05259617418050766, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2840 + }, + { + "epoch": 2.2691693290734825, + "grad_norm": 0.0654863640666008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2841 + }, + { + "epoch": 2.269968051118211, + "grad_norm": 0.04345248267054558, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2842 + }, + { + "epoch": 2.270766773162939, + "grad_norm": 0.057224296033382416, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2843 + }, + { + "epoch": 2.2715654952076676, + "grad_norm": 0.11091717332601547, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2844 + }, + { + "epoch": 2.272364217252396, + "grad_norm": 0.11426062136888504, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2845 + }, + { + "epoch": 2.2731629392971247, + "grad_norm": 0.10064966231584549, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2846 + }, + { + "epoch": 2.273961661341853, + "grad_norm": 0.13716623187065125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2847 + }, + { + "epoch": 2.2747603833865817, + "grad_norm": 0.09014318138360977, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2848 + }, + { + "epoch": 2.27555910543131, + "grad_norm": 0.16652478277683258, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2849 + }, + { + "epoch": 2.2763578274760383, + "grad_norm": 0.14217601716518402, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2850 + }, + { + "epoch": 2.277156549520767, + "grad_norm": 0.03895508497953415, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2851 + }, + { + "epoch": 2.2779552715654954, + "grad_norm": 0.17713558673858643, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2852 + }, + { + "epoch": 2.2787539936102235, + "grad_norm": 0.32960572838783264, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 2853 + }, + { + "epoch": 2.279552715654952, + "grad_norm": 0.2481910139322281, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2854 + }, + { + "epoch": 2.2803514376996805, + "grad_norm": 0.06643390655517578, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2855 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.17466357350349426, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2856 + }, + { + "epoch": 2.2819488817891376, + "grad_norm": 0.27781131863594055, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2857 + }, + { + "epoch": 2.2827476038338657, + "grad_norm": 0.19475431740283966, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2858 + }, + { + "epoch": 2.283546325878594, + "grad_norm": 0.07700221985578537, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 2859 + }, + { + "epoch": 2.2843450479233227, + "grad_norm": 0.22520926594734192, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2860 + }, + { + "epoch": 2.2851437699680512, + "grad_norm": 0.18735183775424957, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2861 + }, + { + "epoch": 2.2859424920127793, + "grad_norm": 0.04133198782801628, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2862 + }, + { + "epoch": 2.286741214057508, + "grad_norm": 0.2526150941848755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2863 + }, + { + "epoch": 2.2875399361022364, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2864 + }, + { + "epoch": 2.288338658146965, + "grad_norm": 0.12839898467063904, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2865 + }, + { + "epoch": 2.2891373801916934, + "grad_norm": 0.1259411871433258, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2866 + }, + { + "epoch": 2.289936102236422, + "grad_norm": 0.25480905175209045, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2867 + }, + { + "epoch": 2.29073482428115, + "grad_norm": 0.15650653839111328, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2868 + }, + { + "epoch": 2.2915335463258786, + "grad_norm": 0.07474946230649948, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2869 + }, + { + "epoch": 2.292332268370607, + "grad_norm": 0.170192688703537, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2870 + }, + { + "epoch": 2.2931309904153356, + "grad_norm": 0.13292376697063446, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2871 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 0.045553866773843765, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2872 + }, + { + "epoch": 2.2947284345047922, + "grad_norm": 0.10853269696235657, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2873 + }, + { + "epoch": 2.2955271565495208, + "grad_norm": 0.09945288300514221, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2874 + }, + { + "epoch": 2.2963258785942493, + "grad_norm": 0.039073117077350616, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2875 + }, + { + "epoch": 2.297124600638978, + "grad_norm": 0.05867530405521393, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2876 + }, + { + "epoch": 2.297923322683706, + "grad_norm": 0.07227179408073425, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2877 + }, + { + "epoch": 2.2987220447284344, + "grad_norm": 0.04456201195716858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2878 + }, + { + "epoch": 2.299520766773163, + "grad_norm": 0.11672481894493103, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2879 + }, + { + "epoch": 2.3003194888178915, + "grad_norm": 0.12335679680109024, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2880 + }, + { + "epoch": 2.3011182108626196, + "grad_norm": 0.043409012258052826, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2881 + }, + { + "epoch": 2.301916932907348, + "grad_norm": 0.09896806627511978, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2882 + }, + { + "epoch": 2.3027156549520766, + "grad_norm": 0.2037963569164276, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2883 + }, + { + "epoch": 2.303514376996805, + "grad_norm": 0.21378903090953827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2884 + }, + { + "epoch": 2.3043130990415337, + "grad_norm": 0.062362927943468094, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2885 + }, + { + "epoch": 2.3051118210862622, + "grad_norm": 0.17370136082172394, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2886 + }, + { + "epoch": 2.3059105431309903, + "grad_norm": 0.23190435767173767, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2887 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.08148342370986938, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2888 + }, + { + "epoch": 2.3075079872204474, + "grad_norm": 0.1596807837486267, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2889 + }, + { + "epoch": 2.308306709265176, + "grad_norm": 0.26396819949150085, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2890 + }, + { + "epoch": 2.309105431309904, + "grad_norm": 0.1509561687707901, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2891 + }, + { + "epoch": 2.3099041533546325, + "grad_norm": 0.09147104620933533, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2892 + }, + { + "epoch": 2.310702875399361, + "grad_norm": 0.23575374484062195, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2893 + }, + { + "epoch": 2.3115015974440896, + "grad_norm": 0.18403767049312592, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2894 + }, + { + "epoch": 2.312300319488818, + "grad_norm": 0.052600763738155365, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2895 + }, + { + "epoch": 2.313099041533546, + "grad_norm": 0.18707415461540222, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2896 + }, + { + "epoch": 2.3138977635782747, + "grad_norm": 0.20824143290519714, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2897 + }, + { + "epoch": 2.3146964856230032, + "grad_norm": 0.0775759220123291, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2898 + }, + { + "epoch": 2.3154952076677318, + "grad_norm": 0.10904766619205475, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2899 + }, + { + "epoch": 2.31629392971246, + "grad_norm": 0.1562514752149582, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2900 + }, + { + "epoch": 2.3170926517571884, + "grad_norm": 0.06689859926700592, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2901 + }, + { + "epoch": 2.317891373801917, + "grad_norm": 0.0887206643819809, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2902 + }, + { + "epoch": 2.3186900958466454, + "grad_norm": 0.13615944981575012, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2903 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.08094146102666855, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2904 + }, + { + "epoch": 2.3202875399361025, + "grad_norm": 0.06734368950128555, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2905 + }, + { + "epoch": 2.3210862619808306, + "grad_norm": 0.17405667901039124, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2906 + }, + { + "epoch": 2.321884984025559, + "grad_norm": 0.23022079467773438, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2907 + }, + { + "epoch": 2.3226837060702876, + "grad_norm": 0.17341896891593933, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2908 + }, + { + "epoch": 2.323482428115016, + "grad_norm": 0.037751875817775726, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2909 + }, + { + "epoch": 2.3242811501597442, + "grad_norm": 0.12434598803520203, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2910 + }, + { + "epoch": 2.3250798722044728, + "grad_norm": 0.11344511806964874, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2911 + }, + { + "epoch": 2.3258785942492013, + "grad_norm": 0.05426390469074249, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2912 + }, + { + "epoch": 2.32667731629393, + "grad_norm": 0.11261611431837082, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2913 + }, + { + "epoch": 2.3274760383386583, + "grad_norm": 0.22023531794548035, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2914 + }, + { + "epoch": 2.3282747603833864, + "grad_norm": 0.2050291895866394, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2915 + }, + { + "epoch": 2.329073482428115, + "grad_norm": 0.05478905141353607, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2916 + }, + { + "epoch": 2.3298722044728435, + "grad_norm": 0.15363283455371857, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2917 + }, + { + "epoch": 2.330670926517572, + "grad_norm": 0.17348943650722504, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2918 + }, + { + "epoch": 2.3314696485623, + "grad_norm": 0.05366649851202965, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2919 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 0.16219462454319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2920 + }, + { + "epoch": 2.333067092651757, + "grad_norm": 0.23911446332931519, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2921 + }, + { + "epoch": 2.3338658146964857, + "grad_norm": 0.12384039163589478, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2922 + }, + { + "epoch": 2.334664536741214, + "grad_norm": 0.08747945725917816, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 2923 + }, + { + "epoch": 2.3354632587859427, + "grad_norm": 0.19737359881401062, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2924 + }, + { + "epoch": 2.336261980830671, + "grad_norm": 0.11312227696180344, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2925 + }, + { + "epoch": 2.3370607028753994, + "grad_norm": 0.09944877028465271, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2926 + }, + { + "epoch": 2.337859424920128, + "grad_norm": 0.23282872140407562, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2927 + }, + { + "epoch": 2.3386581469648564, + "grad_norm": 0.14369411766529083, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2928 + }, + { + "epoch": 2.3394568690095845, + "grad_norm": 0.07267388701438904, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2929 + }, + { + "epoch": 2.340255591054313, + "grad_norm": 0.18751965463161469, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2930 + }, + { + "epoch": 2.3410543130990416, + "grad_norm": 0.20886634290218353, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2931 + }, + { + "epoch": 2.34185303514377, + "grad_norm": 0.11675436794757843, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2932 + }, + { + "epoch": 2.3426517571884986, + "grad_norm": 0.08915580064058304, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2933 + }, + { + "epoch": 2.3434504792332267, + "grad_norm": 0.1534406840801239, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2934 + }, + { + "epoch": 2.344249201277955, + "grad_norm": 0.08791724592447281, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2935 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 0.04647858813405037, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2936 + }, + { + "epoch": 2.3458466453674123, + "grad_norm": 0.09236840158700943, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2937 + }, + { + "epoch": 2.3466453674121404, + "grad_norm": 0.09079006314277649, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2938 + }, + { + "epoch": 2.347444089456869, + "grad_norm": 0.03492455556988716, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2939 + }, + { + "epoch": 2.3482428115015974, + "grad_norm": 0.11871617287397385, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2940 + }, + { + "epoch": 2.349041533546326, + "grad_norm": 0.10904752463102341, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2941 + }, + { + "epoch": 2.3498402555910545, + "grad_norm": 0.05331781879067421, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2942 + }, + { + "epoch": 2.3506389776357826, + "grad_norm": 0.1213313564658165, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2943 + }, + { + "epoch": 2.351437699680511, + "grad_norm": 0.12995922565460205, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 2944 + }, + { + "epoch": 2.3522364217252396, + "grad_norm": 0.05770767107605934, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2945 + }, + { + "epoch": 2.353035143769968, + "grad_norm": 0.09310754388570786, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 2946 + }, + { + "epoch": 2.3538338658146967, + "grad_norm": 0.17539645731449127, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2947 + }, + { + "epoch": 2.3546325878594248, + "grad_norm": 0.14126333594322205, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2948 + }, + { + "epoch": 2.3554313099041533, + "grad_norm": 0.04220091179013252, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2949 + }, + { + "epoch": 2.356230031948882, + "grad_norm": 0.14341594278812408, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2950 + }, + { + "epoch": 2.3570287539936103, + "grad_norm": 0.13884525001049042, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2951 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 0.040859755128622055, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2952 + }, + { + "epoch": 2.358626198083067, + "grad_norm": 0.14475658535957336, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 2953 + }, + { + "epoch": 2.3594249201277955, + "grad_norm": 0.18962377309799194, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2954 + }, + { + "epoch": 2.360223642172524, + "grad_norm": 0.0909075066447258, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2955 + }, + { + "epoch": 2.3610223642172525, + "grad_norm": 0.08225106447935104, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2956 + }, + { + "epoch": 2.3618210862619806, + "grad_norm": 0.1564486026763916, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2957 + }, + { + "epoch": 2.362619808306709, + "grad_norm": 0.08859751373529434, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2958 + }, + { + "epoch": 2.3634185303514377, + "grad_norm": 0.10907880961894989, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2959 + }, + { + "epoch": 2.364217252396166, + "grad_norm": 0.2368745654821396, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2960 + }, + { + "epoch": 2.3650159744408947, + "grad_norm": 0.15427371859550476, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2961 + }, + { + "epoch": 2.365814696485623, + "grad_norm": 0.07661470025777817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2962 + }, + { + "epoch": 2.3666134185303513, + "grad_norm": 0.2368732988834381, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2963 + }, + { + "epoch": 2.36741214057508, + "grad_norm": 0.24830125272274017, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2964 + }, + { + "epoch": 2.3682108626198084, + "grad_norm": 0.06940490007400513, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2965 + }, + { + "epoch": 2.369009584664537, + "grad_norm": 0.18672171235084534, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 2966 + }, + { + "epoch": 2.369808306709265, + "grad_norm": 0.22521120309829712, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2967 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 0.0496690534055233, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2968 + }, + { + "epoch": 2.371405750798722, + "grad_norm": 0.16735650599002838, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2969 + }, + { + "epoch": 2.3722044728434506, + "grad_norm": 0.18583746254444122, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2970 + }, + { + "epoch": 2.373003194888179, + "grad_norm": 0.03828646242618561, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2971 + }, + { + "epoch": 2.373801916932907, + "grad_norm": 0.14302043616771698, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2972 + }, + { + "epoch": 2.3746006389776357, + "grad_norm": 0.14217248558998108, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2973 + }, + { + "epoch": 2.3753993610223643, + "grad_norm": 0.08656741678714752, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2974 + }, + { + "epoch": 2.376198083067093, + "grad_norm": 0.18724001944065094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2975 + }, + { + "epoch": 2.376996805111821, + "grad_norm": 0.21609556674957275, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2976 + }, + { + "epoch": 2.3777955271565494, + "grad_norm": 0.08098721504211426, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2977 + }, + { + "epoch": 2.378594249201278, + "grad_norm": 0.09842941910028458, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2978 + }, + { + "epoch": 2.3793929712460065, + "grad_norm": 0.14060764014720917, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2979 + }, + { + "epoch": 2.380191693290735, + "grad_norm": 0.063141830265522, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2980 + }, + { + "epoch": 2.380990415335463, + "grad_norm": 0.10411619395017624, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2981 + }, + { + "epoch": 2.3817891373801916, + "grad_norm": 0.15445855259895325, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2982 + }, + { + "epoch": 2.38258785942492, + "grad_norm": 0.07754000276327133, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2983 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 0.05312122777104378, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2984 + }, + { + "epoch": 2.384185303514377, + "grad_norm": 0.09916596859693527, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2985 + }, + { + "epoch": 2.3849840255591053, + "grad_norm": 0.12749150395393372, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2986 + }, + { + "epoch": 2.385782747603834, + "grad_norm": 0.054589178413152695, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2987 + }, + { + "epoch": 2.3865814696485623, + "grad_norm": 0.08480732887983322, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2988 + }, + { + "epoch": 2.387380191693291, + "grad_norm": 0.13158805668354034, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 2989 + }, + { + "epoch": 2.3881789137380194, + "grad_norm": 0.11916540563106537, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2990 + }, + { + "epoch": 2.3889776357827475, + "grad_norm": 0.05829031020402908, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2991 + }, + { + "epoch": 2.389776357827476, + "grad_norm": 0.18292354047298431, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2992 + }, + { + "epoch": 2.3905750798722045, + "grad_norm": 0.18494512140750885, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2993 + }, + { + "epoch": 2.391373801916933, + "grad_norm": 0.06371760368347168, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2994 + }, + { + "epoch": 2.392172523961661, + "grad_norm": 0.10157672315835953, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2995 + }, + { + "epoch": 2.3929712460063897, + "grad_norm": 0.13981172442436218, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2996 + }, + { + "epoch": 2.393769968051118, + "grad_norm": 0.07794835418462753, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2997 + }, + { + "epoch": 2.3945686900958467, + "grad_norm": 0.038293492048978806, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2998 + }, + { + "epoch": 2.3953674121405752, + "grad_norm": 0.06315408647060394, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2999 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 0.045907966792583466, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3000 + }, + { + "epoch": 2.396964856230032, + "grad_norm": 0.038717497140169144, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3001 + }, + { + "epoch": 2.3977635782747604, + "grad_norm": 0.0376095287501812, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3002 + }, + { + "epoch": 2.398562300319489, + "grad_norm": 0.05739009007811546, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3003 + }, + { + "epoch": 2.3993610223642174, + "grad_norm": 0.034832656383514404, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3004 + }, + { + "epoch": 2.4001597444089455, + "grad_norm": 0.06432276219129562, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3005 + }, + { + "epoch": 2.400958466453674, + "grad_norm": 0.05443817004561424, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3006 + }, + { + "epoch": 2.4017571884984026, + "grad_norm": 0.04691087454557419, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3007 + }, + { + "epoch": 2.402555910543131, + "grad_norm": 0.04394471272826195, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3008 + }, + { + "epoch": 2.4033546325878596, + "grad_norm": 0.03642019256949425, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3009 + }, + { + "epoch": 2.4041533546325877, + "grad_norm": 0.05891808122396469, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3010 + }, + { + "epoch": 2.4049520766773163, + "grad_norm": 0.04530616104602814, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3011 + }, + { + "epoch": 2.405750798722045, + "grad_norm": 0.0518258772790432, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3012 + }, + { + "epoch": 2.4065495207667733, + "grad_norm": 0.11279664188623428, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3013 + }, + { + "epoch": 2.4073482428115014, + "grad_norm": 0.10047753900289536, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3014 + }, + { + "epoch": 2.40814696485623, + "grad_norm": 0.06645897775888443, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3015 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 0.03372915834188461, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3016 + }, + { + "epoch": 2.409744408945687, + "grad_norm": 0.05353475734591484, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3017 + }, + { + "epoch": 2.4105431309904155, + "grad_norm": 0.038493942469358444, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3018 + }, + { + "epoch": 2.4113418530351436, + "grad_norm": 0.07303082197904587, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3019 + }, + { + "epoch": 2.412140575079872, + "grad_norm": 0.043219298124313354, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3020 + }, + { + "epoch": 2.4129392971246006, + "grad_norm": 0.05016458407044411, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3021 + }, + { + "epoch": 2.413738019169329, + "grad_norm": 0.08490880578756332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3022 + }, + { + "epoch": 2.4145367412140573, + "grad_norm": 0.07245411723852158, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3023 + }, + { + "epoch": 2.415335463258786, + "grad_norm": 0.052343063056468964, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3024 + }, + { + "epoch": 2.4161341853035143, + "grad_norm": 0.13449524343013763, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3025 + }, + { + "epoch": 2.416932907348243, + "grad_norm": 0.13177144527435303, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3026 + }, + { + "epoch": 2.4177316293929714, + "grad_norm": 0.06579594314098358, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 3027 + }, + { + "epoch": 2.4185303514377, + "grad_norm": 0.12716646492481232, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3028 + }, + { + "epoch": 2.419329073482428, + "grad_norm": 0.20006005465984344, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3029 + }, + { + "epoch": 2.4201277955271565, + "grad_norm": 0.16598355770111084, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3030 + }, + { + "epoch": 2.420926517571885, + "grad_norm": 0.06625109165906906, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3031 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.10521841049194336, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3032 + }, + { + "epoch": 2.4225239616613417, + "grad_norm": 0.14134426414966583, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3033 + }, + { + "epoch": 2.42332268370607, + "grad_norm": 0.056669678539037704, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3034 + }, + { + "epoch": 2.4241214057507987, + "grad_norm": 0.052738044410943985, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3035 + }, + { + "epoch": 2.4249201277955272, + "grad_norm": 0.06623729318380356, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3036 + }, + { + "epoch": 2.4257188498402558, + "grad_norm": 0.04038512706756592, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3037 + }, + { + "epoch": 2.426517571884984, + "grad_norm": 0.057600609958171844, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3038 + }, + { + "epoch": 2.4273162939297124, + "grad_norm": 0.08174199610948563, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3039 + }, + { + "epoch": 2.428115015974441, + "grad_norm": 0.07850457727909088, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3040 + }, + { + "epoch": 2.4289137380191694, + "grad_norm": 0.04368523135781288, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3041 + }, + { + "epoch": 2.4297124600638975, + "grad_norm": 0.11637478321790695, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3042 + }, + { + "epoch": 2.430511182108626, + "grad_norm": 0.09765078872442245, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3043 + }, + { + "epoch": 2.4313099041533546, + "grad_norm": 0.04842933267354965, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3044 + }, + { + "epoch": 2.432108626198083, + "grad_norm": 0.08858928829431534, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3045 + }, + { + "epoch": 2.4329073482428116, + "grad_norm": 0.12645326554775238, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3046 + }, + { + "epoch": 2.43370607028754, + "grad_norm": 0.09839878976345062, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 3047 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 0.04484904557466507, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3048 + }, + { + "epoch": 2.4353035143769968, + "grad_norm": 0.13912586867809296, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3049 + }, + { + "epoch": 2.4361022364217253, + "grad_norm": 0.18569444119930267, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3050 + }, + { + "epoch": 2.436900958466454, + "grad_norm": 0.13544169068336487, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3051 + }, + { + "epoch": 2.437699680511182, + "grad_norm": 0.04663483425974846, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3052 + }, + { + "epoch": 2.4384984025559104, + "grad_norm": 0.11609578132629395, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3053 + }, + { + "epoch": 2.439297124600639, + "grad_norm": 0.17497499287128448, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3054 + }, + { + "epoch": 2.4400958466453675, + "grad_norm": 0.19216352701187134, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3055 + }, + { + "epoch": 2.440894568690096, + "grad_norm": 0.11638841032981873, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3056 + }, + { + "epoch": 2.441693290734824, + "grad_norm": 0.05816149711608887, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3057 + }, + { + "epoch": 2.4424920127795526, + "grad_norm": 0.1650087982416153, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3058 + }, + { + "epoch": 2.443290734824281, + "grad_norm": 0.2105383425951004, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3059 + }, + { + "epoch": 2.4440894568690097, + "grad_norm": 0.133597731590271, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3060 + }, + { + "epoch": 2.4448881789137378, + "grad_norm": 0.03882076218724251, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3061 + }, + { + "epoch": 2.4456869009584663, + "grad_norm": 0.08914566785097122, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3062 + }, + { + "epoch": 2.446485623003195, + "grad_norm": 0.08115291595458984, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3063 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 0.0402134470641613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3064 + }, + { + "epoch": 2.448083067092652, + "grad_norm": 0.12838906049728394, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3065 + }, + { + "epoch": 2.4488817891373804, + "grad_norm": 0.1865018606185913, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3066 + }, + { + "epoch": 2.4496805111821085, + "grad_norm": 0.13134929537773132, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3067 + }, + { + "epoch": 2.450479233226837, + "grad_norm": 0.05415928363800049, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3068 + }, + { + "epoch": 2.4512779552715656, + "grad_norm": 0.0739838033914566, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3069 + }, + { + "epoch": 2.452076677316294, + "grad_norm": 0.07965957373380661, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 3070 + }, + { + "epoch": 2.452875399361022, + "grad_norm": 0.0416380800306797, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3071 + }, + { + "epoch": 2.4536741214057507, + "grad_norm": 0.03494519367814064, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3072 + }, + { + "epoch": 2.4544728434504792, + "grad_norm": 0.050772733986377716, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3073 + }, + { + "epoch": 2.4552715654952078, + "grad_norm": 0.03939373791217804, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3074 + }, + { + "epoch": 2.4560702875399363, + "grad_norm": 0.11769624799489975, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3075 + }, + { + "epoch": 2.4568690095846644, + "grad_norm": 0.33884114027023315, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3076 + }, + { + "epoch": 2.457667731629393, + "grad_norm": 0.07171089947223663, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3077 + }, + { + "epoch": 2.4584664536741214, + "grad_norm": 0.0707232877612114, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3078 + }, + { + "epoch": 2.45926517571885, + "grad_norm": 0.14245279133319855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3079 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.12356095761060715, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3080 + }, + { + "epoch": 2.4608626198083066, + "grad_norm": 0.0694037601351738, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3081 + }, + { + "epoch": 2.461661341853035, + "grad_norm": 0.0511220321059227, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3082 + }, + { + "epoch": 2.4624600638977636, + "grad_norm": 0.10915348678827286, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3083 + }, + { + "epoch": 2.463258785942492, + "grad_norm": 0.10797106474637985, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3084 + }, + { + "epoch": 2.4640575079872207, + "grad_norm": 0.05721200630068779, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3085 + }, + { + "epoch": 2.4648562300319488, + "grad_norm": 0.04477681592106819, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3086 + }, + { + "epoch": 2.4656549520766773, + "grad_norm": 0.08826448023319244, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3087 + }, + { + "epoch": 2.466453674121406, + "grad_norm": 0.1024692952632904, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3088 + }, + { + "epoch": 2.4672523961661343, + "grad_norm": 0.06543146073818207, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3089 + }, + { + "epoch": 2.4680511182108624, + "grad_norm": 0.06146182119846344, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3090 + }, + { + "epoch": 2.468849840255591, + "grad_norm": 0.12857408821582794, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3091 + }, + { + "epoch": 2.4696485623003195, + "grad_norm": 0.12273124605417252, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3092 + }, + { + "epoch": 2.470447284345048, + "grad_norm": 0.06467662751674652, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3093 + }, + { + "epoch": 2.4712460063897765, + "grad_norm": 0.07181179523468018, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3094 + }, + { + "epoch": 2.4720447284345046, + "grad_norm": 0.20223456621170044, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3095 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 0.25061357021331787, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3096 + }, + { + "epoch": 2.4736421725239617, + "grad_norm": 0.16317492723464966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3097 + }, + { + "epoch": 2.47444089456869, + "grad_norm": 0.04005994647741318, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3098 + }, + { + "epoch": 2.4752396166134183, + "grad_norm": 0.15954583883285522, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3099 + }, + { + "epoch": 2.476038338658147, + "grad_norm": 0.2088920623064041, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3100 + }, + { + "epoch": 2.4768370607028753, + "grad_norm": 0.11643055826425552, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3101 + }, + { + "epoch": 2.477635782747604, + "grad_norm": 0.11083687841892242, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3102 + }, + { + "epoch": 2.4784345047923324, + "grad_norm": 0.24777425825595856, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3103 + }, + { + "epoch": 2.479233226837061, + "grad_norm": 0.19513146579265594, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3104 + }, + { + "epoch": 2.480031948881789, + "grad_norm": 0.05009200796484947, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3105 + }, + { + "epoch": 2.4808306709265175, + "grad_norm": 0.2673046588897705, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3106 + }, + { + "epoch": 2.481629392971246, + "grad_norm": 0.3035629093647003, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3107 + }, + { + "epoch": 2.4824281150159746, + "grad_norm": 0.13213352859020233, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3108 + }, + { + "epoch": 2.4832268370607027, + "grad_norm": 0.13605083525180817, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3109 + }, + { + "epoch": 2.484025559105431, + "grad_norm": 0.2958623170852661, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3110 + }, + { + "epoch": 2.4848242811501597, + "grad_norm": 0.23080390691757202, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3111 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 0.046950701624155045, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3112 + }, + { + "epoch": 2.486421725239617, + "grad_norm": 0.24903765320777893, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3113 + }, + { + "epoch": 2.487220447284345, + "grad_norm": 0.233968585729599, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3114 + }, + { + "epoch": 2.4880191693290734, + "grad_norm": 0.04709520563483238, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3115 + }, + { + "epoch": 2.488817891373802, + "grad_norm": 0.16599629819393158, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3116 + }, + { + "epoch": 2.4896166134185305, + "grad_norm": 0.19273866713047028, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3117 + }, + { + "epoch": 2.4904153354632586, + "grad_norm": 0.11514598876237869, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3118 + }, + { + "epoch": 2.491214057507987, + "grad_norm": 0.08656881004571915, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 3119 + }, + { + "epoch": 2.4920127795527156, + "grad_norm": 0.18213899433612823, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3120 + }, + { + "epoch": 2.492811501597444, + "grad_norm": 0.11029175668954849, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3121 + }, + { + "epoch": 2.4936102236421727, + "grad_norm": 0.04480903223156929, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3122 + }, + { + "epoch": 2.494408945686901, + "grad_norm": 0.04919225722551346, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3123 + }, + { + "epoch": 2.4952076677316293, + "grad_norm": 0.06349056959152222, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3124 + }, + { + "epoch": 2.496006389776358, + "grad_norm": 0.04066464304924011, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3125 + }, + { + "epoch": 2.4968051118210863, + "grad_norm": 0.03992457687854767, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3126 + }, + { + "epoch": 2.497603833865815, + "grad_norm": 0.04580394923686981, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3127 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.13679265975952148, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3128 + }, + { + "epoch": 2.4992012779552715, + "grad_norm": 0.20708884298801422, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3129 + }, + { + "epoch": 2.5, + "grad_norm": 0.22991639375686646, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3130 + }, + { + "epoch": 2.5007987220447285, + "grad_norm": 0.15380895137786865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3131 + }, + { + "epoch": 2.501597444089457, + "grad_norm": 0.05112789571285248, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3132 + }, + { + "epoch": 2.502396166134185, + "grad_norm": 0.19797906279563904, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3133 + }, + { + "epoch": 2.5031948881789137, + "grad_norm": 0.18190141022205353, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3134 + }, + { + "epoch": 2.503993610223642, + "grad_norm": 0.04291468858718872, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3135 + }, + { + "epoch": 2.5047923322683707, + "grad_norm": 0.14576731622219086, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3136 + }, + { + "epoch": 2.505591054313099, + "grad_norm": 0.25093281269073486, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3137 + }, + { + "epoch": 2.5063897763578273, + "grad_norm": 0.22738556563854218, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3138 + }, + { + "epoch": 2.507188498402556, + "grad_norm": 0.08985915035009384, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3139 + }, + { + "epoch": 2.5079872204472844, + "grad_norm": 0.09632397443056107, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3140 + }, + { + "epoch": 2.508785942492013, + "grad_norm": 0.12138333916664124, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3141 + }, + { + "epoch": 2.5095846645367414, + "grad_norm": 0.04163306951522827, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3142 + }, + { + "epoch": 2.5103833865814695, + "grad_norm": 0.06187185272574425, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3143 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.09463546425104141, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3144 + }, + { + "epoch": 2.5119808306709266, + "grad_norm": 0.12386980652809143, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3145 + }, + { + "epoch": 2.512779552715655, + "grad_norm": 0.07090163975954056, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3146 + }, + { + "epoch": 2.513578274760383, + "grad_norm": 0.04502219334244728, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3147 + }, + { + "epoch": 2.5143769968051117, + "grad_norm": 0.08453603833913803, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3148 + }, + { + "epoch": 2.5151757188498403, + "grad_norm": 0.08686821907758713, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3149 + }, + { + "epoch": 2.515974440894569, + "grad_norm": 0.03968734294176102, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3150 + }, + { + "epoch": 2.5167731629392973, + "grad_norm": 0.08613990992307663, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3151 + }, + { + "epoch": 2.5175718849840254, + "grad_norm": 0.07950794696807861, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3152 + }, + { + "epoch": 2.518370607028754, + "grad_norm": 0.0449741929769516, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3153 + }, + { + "epoch": 2.5191693290734825, + "grad_norm": 0.09032034873962402, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3154 + }, + { + "epoch": 2.519968051118211, + "grad_norm": 0.06834430247545242, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3155 + }, + { + "epoch": 2.520766773162939, + "grad_norm": 0.13820379972457886, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3156 + }, + { + "epoch": 2.5215654952076676, + "grad_norm": 0.17753586173057556, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3157 + }, + { + "epoch": 2.522364217252396, + "grad_norm": 0.2663286626338959, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3158 + }, + { + "epoch": 2.5231629392971247, + "grad_norm": 0.21509577333927155, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3159 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.04614022746682167, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3160 + }, + { + "epoch": 2.5247603833865817, + "grad_norm": 0.13719527423381805, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3161 + }, + { + "epoch": 2.52555910543131, + "grad_norm": 0.20119087398052216, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3162 + }, + { + "epoch": 2.5263578274760383, + "grad_norm": 0.1822054237127304, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3163 + }, + { + "epoch": 2.527156549520767, + "grad_norm": 0.06550543755292892, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3164 + }, + { + "epoch": 2.527955271565495, + "grad_norm": 0.08079471439123154, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3165 + }, + { + "epoch": 2.5287539936102235, + "grad_norm": 0.10106988251209259, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3166 + }, + { + "epoch": 2.529552715654952, + "grad_norm": 0.06818784028291702, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3167 + }, + { + "epoch": 2.5303514376996805, + "grad_norm": 0.05976718291640282, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3168 + }, + { + "epoch": 2.531150159744409, + "grad_norm": 0.18163853883743286, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3169 + }, + { + "epoch": 2.5319488817891376, + "grad_norm": 0.26418858766555786, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3170 + }, + { + "epoch": 2.5327476038338657, + "grad_norm": 0.24044150114059448, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3171 + }, + { + "epoch": 2.533546325878594, + "grad_norm": 0.07499254494905472, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3172 + }, + { + "epoch": 2.5343450479233227, + "grad_norm": 0.17483314871788025, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3173 + }, + { + "epoch": 2.5351437699680512, + "grad_norm": 0.2698160707950592, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3174 + }, + { + "epoch": 2.5359424920127793, + "grad_norm": 0.2116270661354065, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3175 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.0545198880136013, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3176 + }, + { + "epoch": 2.5375399361022364, + "grad_norm": 0.1926649659872055, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3177 + }, + { + "epoch": 2.538338658146965, + "grad_norm": 0.24152790009975433, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3178 + }, + { + "epoch": 2.5391373801916934, + "grad_norm": 0.12380969524383545, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3179 + }, + { + "epoch": 2.539936102236422, + "grad_norm": 0.07934054732322693, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3180 + }, + { + "epoch": 2.54073482428115, + "grad_norm": 0.13688413798809052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3181 + }, + { + "epoch": 2.5415335463258786, + "grad_norm": 0.05832000821828842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3182 + }, + { + "epoch": 2.542332268370607, + "grad_norm": 0.08729993551969528, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3183 + }, + { + "epoch": 2.543130990415335, + "grad_norm": 0.16843630373477936, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3184 + }, + { + "epoch": 2.5439297124600637, + "grad_norm": 0.13045506179332733, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3185 + }, + { + "epoch": 2.5447284345047922, + "grad_norm": 0.038882140070199966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3186 + }, + { + "epoch": 2.5455271565495208, + "grad_norm": 0.14922545850276947, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3187 + }, + { + "epoch": 2.5463258785942493, + "grad_norm": 0.1961440145969391, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3188 + }, + { + "epoch": 2.547124600638978, + "grad_norm": 0.08585302531719208, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3189 + }, + { + "epoch": 2.547923322683706, + "grad_norm": 0.13141697645187378, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3190 + }, + { + "epoch": 2.5487220447284344, + "grad_norm": 0.20332233607769012, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3191 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.1740144044160843, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3192 + }, + { + "epoch": 2.5503194888178915, + "grad_norm": 0.04738207906484604, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3193 + }, + { + "epoch": 2.5511182108626196, + "grad_norm": 0.23204317688941956, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3194 + }, + { + "epoch": 2.551916932907348, + "grad_norm": 0.29033714532852173, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3195 + }, + { + "epoch": 2.5527156549520766, + "grad_norm": 0.1251334547996521, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3196 + }, + { + "epoch": 2.553514376996805, + "grad_norm": 0.1610727608203888, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3197 + }, + { + "epoch": 2.5543130990415337, + "grad_norm": 0.284105509519577, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3198 + }, + { + "epoch": 2.5551118210862622, + "grad_norm": 0.1530643254518509, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 3199 + }, + { + "epoch": 2.5559105431309903, + "grad_norm": 0.07761498540639877, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3200 + }, + { + "epoch": 2.556709265175719, + "grad_norm": 0.16693277657032013, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3201 + }, + { + "epoch": 2.5575079872204474, + "grad_norm": 0.06345608085393906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3202 + }, + { + "epoch": 2.5583067092651754, + "grad_norm": 0.10956210643053055, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3203 + }, + { + "epoch": 2.559105431309904, + "grad_norm": 0.17655007541179657, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3204 + }, + { + "epoch": 2.5599041533546325, + "grad_norm": 0.12615050375461578, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3205 + }, + { + "epoch": 2.560702875399361, + "grad_norm": 0.049671441316604614, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3206 + }, + { + "epoch": 2.5615015974440896, + "grad_norm": 0.16559815406799316, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3207 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 0.1279190182685852, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3208 + }, + { + "epoch": 2.563099041533546, + "grad_norm": 0.0540652722120285, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3209 + }, + { + "epoch": 2.5638977635782747, + "grad_norm": 0.1287074238061905, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 3210 + }, + { + "epoch": 2.5646964856230032, + "grad_norm": 0.1118067055940628, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3211 + }, + { + "epoch": 2.5654952076677318, + "grad_norm": 0.05159451439976692, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3212 + }, + { + "epoch": 2.56629392971246, + "grad_norm": 0.10654652118682861, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3213 + }, + { + "epoch": 2.5670926517571884, + "grad_norm": 0.15669982135295868, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3214 + }, + { + "epoch": 2.567891373801917, + "grad_norm": 0.11388157308101654, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3215 + }, + { + "epoch": 2.5686900958466454, + "grad_norm": 0.06434119492769241, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3216 + }, + { + "epoch": 2.569488817891374, + "grad_norm": 0.050070468336343765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3217 + }, + { + "epoch": 2.5702875399361025, + "grad_norm": 0.0522335022687912, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3218 + }, + { + "epoch": 2.5710862619808306, + "grad_norm": 0.04716494306921959, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3219 + }, + { + "epoch": 2.571884984025559, + "grad_norm": 0.03770711272954941, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3220 + }, + { + "epoch": 2.5726837060702876, + "grad_norm": 0.03955485299229622, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3221 + }, + { + "epoch": 2.5734824281150157, + "grad_norm": 0.03824841231107712, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3222 + }, + { + "epoch": 2.5742811501597442, + "grad_norm": 0.04722970351576805, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3223 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 0.05470758676528931, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3224 + }, + { + "epoch": 2.5758785942492013, + "grad_norm": 0.04934269189834595, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3225 + }, + { + "epoch": 2.57667731629393, + "grad_norm": 0.040627289563417435, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3226 + }, + { + "epoch": 2.5774760383386583, + "grad_norm": 0.05668056383728981, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3227 + }, + { + "epoch": 2.5782747603833864, + "grad_norm": 0.11724753677845001, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3228 + }, + { + "epoch": 2.579073482428115, + "grad_norm": 0.12204517424106598, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3229 + }, + { + "epoch": 2.5798722044728435, + "grad_norm": 0.10652083158493042, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3230 + }, + { + "epoch": 2.580670926517572, + "grad_norm": 0.07430299371480942, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3231 + }, + { + "epoch": 2.5814696485623, + "grad_norm": 0.03460770472884178, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3232 + }, + { + "epoch": 2.5822683706070286, + "grad_norm": 0.080150306224823, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3233 + }, + { + "epoch": 2.583067092651757, + "grad_norm": 0.1291198879480362, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3234 + }, + { + "epoch": 2.5838658146964857, + "grad_norm": 0.19541533291339874, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3235 + }, + { + "epoch": 2.584664536741214, + "grad_norm": 0.24089939892292023, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3236 + }, + { + "epoch": 2.5854632587859427, + "grad_norm": 0.1933099627494812, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3237 + }, + { + "epoch": 2.586261980830671, + "grad_norm": 0.07295489311218262, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3238 + }, + { + "epoch": 2.5870607028753994, + "grad_norm": 0.10686071962118149, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3239 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 0.17052637040615082, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3240 + }, + { + "epoch": 2.588658146964856, + "grad_norm": 0.12377535551786423, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3241 + }, + { + "epoch": 2.5894568690095845, + "grad_norm": 0.03730800375342369, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3242 + }, + { + "epoch": 2.590255591054313, + "grad_norm": 0.13848428428173065, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3243 + }, + { + "epoch": 2.5910543130990416, + "grad_norm": 0.18361017107963562, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3244 + }, + { + "epoch": 2.59185303514377, + "grad_norm": 0.11140795797109604, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3245 + }, + { + "epoch": 2.5926517571884986, + "grad_norm": 0.033891428261995316, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3246 + }, + { + "epoch": 2.5934504792332267, + "grad_norm": 0.13179628551006317, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3247 + }, + { + "epoch": 2.594249201277955, + "grad_norm": 0.19785374402999878, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3248 + }, + { + "epoch": 2.5950479233226837, + "grad_norm": 0.15991398692131042, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3249 + }, + { + "epoch": 2.5958466453674123, + "grad_norm": 0.0702645480632782, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3250 + }, + { + "epoch": 2.5966453674121404, + "grad_norm": 0.038220152258872986, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3251 + }, + { + "epoch": 2.597444089456869, + "grad_norm": 0.048042308539152145, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3252 + }, + { + "epoch": 2.5982428115015974, + "grad_norm": 0.05673132464289665, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3253 + }, + { + "epoch": 2.599041533546326, + "grad_norm": 0.057284750044345856, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3254 + }, + { + "epoch": 2.5998402555910545, + "grad_norm": 0.052904874086380005, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3255 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.04914860427379608, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3256 + }, + { + "epoch": 2.601437699680511, + "grad_norm": 0.08870472013950348, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3257 + }, + { + "epoch": 2.6022364217252396, + "grad_norm": 0.09863728284835815, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3258 + }, + { + "epoch": 2.603035143769968, + "grad_norm": 0.08116353303194046, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3259 + }, + { + "epoch": 2.6038338658146962, + "grad_norm": 0.043653007596731186, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3260 + }, + { + "epoch": 2.6046325878594248, + "grad_norm": 0.0579618401825428, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3261 + }, + { + "epoch": 2.6054313099041533, + "grad_norm": 0.08072935789823532, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3262 + }, + { + "epoch": 2.606230031948882, + "grad_norm": 0.05391686409711838, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3263 + }, + { + "epoch": 2.6070287539936103, + "grad_norm": 0.03471128270030022, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3264 + }, + { + "epoch": 2.607827476038339, + "grad_norm": 0.056328870356082916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3265 + }, + { + "epoch": 2.608626198083067, + "grad_norm": 0.05196002125740051, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3266 + }, + { + "epoch": 2.6094249201277955, + "grad_norm": 0.04338999465107918, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3267 + }, + { + "epoch": 2.610223642172524, + "grad_norm": 0.12365762889385223, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3268 + }, + { + "epoch": 2.6110223642172525, + "grad_norm": 0.19469699263572693, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3269 + }, + { + "epoch": 2.6118210862619806, + "grad_norm": 0.1825639009475708, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3270 + }, + { + "epoch": 2.612619808306709, + "grad_norm": 0.10235249251127243, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3271 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 0.05571124702692032, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3272 + }, + { + "epoch": 2.614217252396166, + "grad_norm": 0.1536952704191208, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3273 + }, + { + "epoch": 2.6150159744408947, + "grad_norm": 0.163212850689888, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3274 + }, + { + "epoch": 2.6158146964856233, + "grad_norm": 0.09640593826770782, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3275 + }, + { + "epoch": 2.6166134185303513, + "grad_norm": 0.04329126700758934, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3276 + }, + { + "epoch": 2.61741214057508, + "grad_norm": 0.03598733991384506, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3277 + }, + { + "epoch": 2.6182108626198084, + "grad_norm": 0.046664439141750336, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3278 + }, + { + "epoch": 2.6190095846645365, + "grad_norm": 0.03692904859781265, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3279 + }, + { + "epoch": 2.619808306709265, + "grad_norm": 0.0482964888215065, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3280 + }, + { + "epoch": 2.6206070287539935, + "grad_norm": 0.07996834069490433, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3281 + }, + { + "epoch": 2.621405750798722, + "grad_norm": 0.060141101479530334, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3282 + }, + { + "epoch": 2.6222044728434506, + "grad_norm": 0.04013051837682724, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 3283 + }, + { + "epoch": 2.623003194888179, + "grad_norm": 0.04011296480894089, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3284 + }, + { + "epoch": 2.623801916932907, + "grad_norm": 0.04112064838409424, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3285 + }, + { + "epoch": 2.6246006389776357, + "grad_norm": 0.057281915098428726, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3286 + }, + { + "epoch": 2.6253993610223643, + "grad_norm": 0.06061771139502525, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3287 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 0.05844549089670181, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3288 + }, + { + "epoch": 2.626996805111821, + "grad_norm": 0.06354600191116333, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3289 + }, + { + "epoch": 2.6277955271565494, + "grad_norm": 0.04568248987197876, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3290 + }, + { + "epoch": 2.628594249201278, + "grad_norm": 0.04340318217873573, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3291 + }, + { + "epoch": 2.6293929712460065, + "grad_norm": 0.07078617066144943, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3292 + }, + { + "epoch": 2.630191693290735, + "grad_norm": 0.09865503013134003, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3293 + }, + { + "epoch": 2.6309904153354635, + "grad_norm": 0.08623871207237244, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3294 + }, + { + "epoch": 2.6317891373801916, + "grad_norm": 0.03787717968225479, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3295 + }, + { + "epoch": 2.63258785942492, + "grad_norm": 0.14653000235557556, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3296 + }, + { + "epoch": 2.6333865814696487, + "grad_norm": 0.2749452292919159, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3297 + }, + { + "epoch": 2.6341853035143767, + "grad_norm": 0.28424543142318726, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3298 + }, + { + "epoch": 2.6349840255591053, + "grad_norm": 0.17354224622249603, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3299 + }, + { + "epoch": 2.635782747603834, + "grad_norm": 0.04208464175462723, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3300 + }, + { + "epoch": 2.6365814696485623, + "grad_norm": 0.15522420406341553, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3301 + }, + { + "epoch": 2.637380191693291, + "grad_norm": 0.17986370623111725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3302 + }, + { + "epoch": 2.6381789137380194, + "grad_norm": 0.07155515998601913, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3303 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.11287503689527512, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3304 + }, + { + "epoch": 2.639776357827476, + "grad_norm": 0.22735139727592468, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3305 + }, + { + "epoch": 2.6405750798722045, + "grad_norm": 0.23528814315795898, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3306 + }, + { + "epoch": 2.641373801916933, + "grad_norm": 0.13828198611736298, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3307 + }, + { + "epoch": 2.642172523961661, + "grad_norm": 0.046783462166786194, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3308 + }, + { + "epoch": 2.6429712460063897, + "grad_norm": 0.13010001182556152, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3309 + }, + { + "epoch": 2.643769968051118, + "grad_norm": 0.12339942902326584, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3310 + }, + { + "epoch": 2.6445686900958467, + "grad_norm": 0.06443019211292267, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3311 + }, + { + "epoch": 2.6453674121405752, + "grad_norm": 0.05086766183376312, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3312 + }, + { + "epoch": 2.6461661341853038, + "grad_norm": 0.1266956627368927, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3313 + }, + { + "epoch": 2.646964856230032, + "grad_norm": 0.1238899901509285, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3314 + }, + { + "epoch": 2.6477635782747604, + "grad_norm": 0.07378736138343811, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3315 + }, + { + "epoch": 2.648562300319489, + "grad_norm": 0.12572194635868073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3316 + }, + { + "epoch": 2.649361022364217, + "grad_norm": 0.18099260330200195, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3317 + }, + { + "epoch": 2.6501597444089455, + "grad_norm": 0.1383541077375412, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3318 + }, + { + "epoch": 2.650958466453674, + "grad_norm": 0.043900374323129654, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3319 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 0.13228318095207214, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3320 + }, + { + "epoch": 2.652555910543131, + "grad_norm": 0.11684399843215942, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3321 + }, + { + "epoch": 2.6533546325878596, + "grad_norm": 0.03879965469241142, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3322 + }, + { + "epoch": 2.6541533546325877, + "grad_norm": 0.1457953006029129, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3323 + }, + { + "epoch": 2.6549520766773163, + "grad_norm": 0.21643802523612976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3324 + }, + { + "epoch": 2.655750798722045, + "grad_norm": 0.20250067114830017, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3325 + }, + { + "epoch": 2.6565495207667733, + "grad_norm": 0.09131773561239243, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3326 + }, + { + "epoch": 2.6573482428115014, + "grad_norm": 0.07217761129140854, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3327 + }, + { + "epoch": 2.65814696485623, + "grad_norm": 0.13251517713069916, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3328 + }, + { + "epoch": 2.6589456869009584, + "grad_norm": 0.09462655335664749, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3329 + }, + { + "epoch": 2.659744408945687, + "grad_norm": 0.04496161639690399, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3330 + }, + { + "epoch": 2.6605431309904155, + "grad_norm": 0.13246162235736847, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3331 + }, + { + "epoch": 2.661341853035144, + "grad_norm": 0.1548391878604889, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3332 + }, + { + "epoch": 2.662140575079872, + "grad_norm": 0.09438800066709518, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3333 + }, + { + "epoch": 2.6629392971246006, + "grad_norm": 0.033411599695682526, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3334 + }, + { + "epoch": 2.663738019169329, + "grad_norm": 0.04015564173460007, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3335 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 0.033046361058950424, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3336 + }, + { + "epoch": 2.665335463258786, + "grad_norm": 0.04766019433736801, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3337 + }, + { + "epoch": 2.6661341853035143, + "grad_norm": 0.06365641951560974, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3338 + }, + { + "epoch": 2.666932907348243, + "grad_norm": 0.03329809010028839, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3339 + }, + { + "epoch": 2.6677316293929714, + "grad_norm": 0.10063061863183975, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3340 + }, + { + "epoch": 2.6685303514377, + "grad_norm": 0.16541579365730286, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 3341 + }, + { + "epoch": 2.669329073482428, + "grad_norm": 0.18877379596233368, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3342 + }, + { + "epoch": 2.6701277955271565, + "grad_norm": 0.12577234208583832, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3343 + }, + { + "epoch": 2.670926517571885, + "grad_norm": 0.04403039440512657, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3344 + }, + { + "epoch": 2.6717252396166136, + "grad_norm": 0.172403946518898, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3345 + }, + { + "epoch": 2.6725239616613417, + "grad_norm": 0.2147791087627411, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3346 + }, + { + "epoch": 2.67332268370607, + "grad_norm": 0.1536005735397339, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3347 + }, + { + "epoch": 2.6741214057507987, + "grad_norm": 0.061038631945848465, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3348 + }, + { + "epoch": 2.6749201277955272, + "grad_norm": 0.03402748703956604, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3349 + }, + { + "epoch": 2.6757188498402558, + "grad_norm": 0.05285736918449402, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3350 + }, + { + "epoch": 2.6765175718849843, + "grad_norm": 0.0807662233710289, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3351 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.057097889482975006, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3352 + }, + { + "epoch": 2.678115015974441, + "grad_norm": 0.06845760345458984, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3353 + }, + { + "epoch": 2.6789137380191694, + "grad_norm": 0.1209796816110611, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3354 + }, + { + "epoch": 2.6797124600638975, + "grad_norm": 0.09372428804636002, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3355 + }, + { + "epoch": 2.680511182108626, + "grad_norm": 0.03795485943555832, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3356 + }, + { + "epoch": 2.6813099041533546, + "grad_norm": 0.14420334994792938, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3357 + }, + { + "epoch": 2.682108626198083, + "grad_norm": 0.23049019277095795, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3358 + }, + { + "epoch": 2.6829073482428116, + "grad_norm": 0.21722057461738586, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3359 + }, + { + "epoch": 2.68370607028754, + "grad_norm": 0.0968366488814354, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3360 + }, + { + "epoch": 2.6845047923322682, + "grad_norm": 0.10279416292905807, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3361 + }, + { + "epoch": 2.6853035143769968, + "grad_norm": 0.2077404409646988, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3362 + }, + { + "epoch": 2.6861022364217253, + "grad_norm": 0.14186711609363556, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3363 + }, + { + "epoch": 2.686900958466454, + "grad_norm": 0.04573604837059975, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3364 + }, + { + "epoch": 2.687699680511182, + "grad_norm": 0.13861627876758575, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3365 + }, + { + "epoch": 2.6884984025559104, + "grad_norm": 0.17746120691299438, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3366 + }, + { + "epoch": 2.689297124600639, + "grad_norm": 0.15865683555603027, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3367 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.05537402629852295, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3368 + }, + { + "epoch": 2.690894568690096, + "grad_norm": 0.064423106610775, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3369 + }, + { + "epoch": 2.6916932907348246, + "grad_norm": 0.0922585278749466, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3370 + }, + { + "epoch": 2.6924920127795526, + "grad_norm": 0.08034171909093857, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3371 + }, + { + "epoch": 2.693290734824281, + "grad_norm": 0.05695292726159096, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3372 + }, + { + "epoch": 2.6940894568690097, + "grad_norm": 0.04140406847000122, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3373 + }, + { + "epoch": 2.6948881789137378, + "grad_norm": 0.038130711764097214, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3374 + }, + { + "epoch": 2.6956869009584663, + "grad_norm": 0.07363594323396683, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3375 + }, + { + "epoch": 2.696485623003195, + "grad_norm": 0.13670513033866882, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3376 + }, + { + "epoch": 2.6972843450479234, + "grad_norm": 0.16614536941051483, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3377 + }, + { + "epoch": 2.698083067092652, + "grad_norm": 0.1346762478351593, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3378 + }, + { + "epoch": 2.6988817891373804, + "grad_norm": 0.06321856379508972, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3379 + }, + { + "epoch": 2.6996805111821085, + "grad_norm": 0.057517897337675095, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3380 + }, + { + "epoch": 2.700479233226837, + "grad_norm": 0.11995001137256622, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3381 + }, + { + "epoch": 2.7012779552715656, + "grad_norm": 0.10514877736568451, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3382 + }, + { + "epoch": 2.702076677316294, + "grad_norm": 0.05942686274647713, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3383 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 0.03508206829428673, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3384 + }, + { + "epoch": 2.7036741214057507, + "grad_norm": 0.05182692036032677, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3385 + }, + { + "epoch": 2.7044728434504792, + "grad_norm": 0.0597345344722271, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3386 + }, + { + "epoch": 2.7052715654952078, + "grad_norm": 0.037486087530851364, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3387 + }, + { + "epoch": 2.7060702875399363, + "grad_norm": 0.040483538061380386, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 3388 + }, + { + "epoch": 2.706869009584665, + "grad_norm": 0.044094670563936234, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3389 + }, + { + "epoch": 2.707667731629393, + "grad_norm": 0.06498228758573532, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3390 + }, + { + "epoch": 2.7084664536741214, + "grad_norm": 0.06955298781394958, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3391 + }, + { + "epoch": 2.70926517571885, + "grad_norm": 0.11691966652870178, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3392 + }, + { + "epoch": 2.710063897763578, + "grad_norm": 0.1183234304189682, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3393 + }, + { + "epoch": 2.7108626198083066, + "grad_norm": 0.08358792215585709, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3394 + }, + { + "epoch": 2.711661341853035, + "grad_norm": 0.04190056398510933, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3395 + }, + { + "epoch": 2.7124600638977636, + "grad_norm": 0.09757649153470993, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3396 + }, + { + "epoch": 2.713258785942492, + "grad_norm": 0.11508934944868088, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 3397 + }, + { + "epoch": 2.7140575079872207, + "grad_norm": 0.05612087994813919, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3398 + }, + { + "epoch": 2.7148562300319488, + "grad_norm": 0.07044408470392227, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3399 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.07732822746038437, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3400 + }, + { + "epoch": 2.716453674121406, + "grad_norm": 0.054326847195625305, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3401 + }, + { + "epoch": 2.7172523961661343, + "grad_norm": 0.041327398270368576, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3402 + }, + { + "epoch": 2.7180511182108624, + "grad_norm": 0.07147548347711563, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3403 + }, + { + "epoch": 2.718849840255591, + "grad_norm": 0.12999942898750305, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3404 + }, + { + "epoch": 2.7196485623003195, + "grad_norm": 0.18404515087604523, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3405 + }, + { + "epoch": 2.720447284345048, + "grad_norm": 0.1873377114534378, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3406 + }, + { + "epoch": 2.7212460063897765, + "grad_norm": 0.0732024610042572, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3407 + }, + { + "epoch": 2.722044728434505, + "grad_norm": 0.07602795958518982, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3408 + }, + { + "epoch": 2.722843450479233, + "grad_norm": 0.07871323823928833, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3409 + }, + { + "epoch": 2.7236421725239617, + "grad_norm": 0.0738302692770958, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3410 + }, + { + "epoch": 2.72444089456869, + "grad_norm": 0.12097286432981491, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3411 + }, + { + "epoch": 2.7252396166134183, + "grad_norm": 0.10136821120977402, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3412 + }, + { + "epoch": 2.726038338658147, + "grad_norm": 0.07281512022018433, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3413 + }, + { + "epoch": 2.7268370607028753, + "grad_norm": 0.09425969421863556, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3414 + }, + { + "epoch": 2.727635782747604, + "grad_norm": 0.11939436942338943, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3415 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 0.07181181758642197, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3416 + }, + { + "epoch": 2.729233226837061, + "grad_norm": 0.06634730845689774, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3417 + }, + { + "epoch": 2.730031948881789, + "grad_norm": 0.0941692590713501, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3418 + }, + { + "epoch": 2.7308306709265175, + "grad_norm": 0.10803452879190445, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3419 + }, + { + "epoch": 2.731629392971246, + "grad_norm": 0.08289305865764618, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3420 + }, + { + "epoch": 2.7324281150159746, + "grad_norm": 0.048421960324048996, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3421 + }, + { + "epoch": 2.7332268370607027, + "grad_norm": 0.09108635783195496, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3422 + }, + { + "epoch": 2.734025559105431, + "grad_norm": 0.13627508282661438, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3423 + }, + { + "epoch": 2.7348242811501597, + "grad_norm": 0.14651858806610107, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3424 + }, + { + "epoch": 2.7356230031948883, + "grad_norm": 0.126741424202919, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3425 + }, + { + "epoch": 2.736421725239617, + "grad_norm": 0.05885545164346695, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3426 + }, + { + "epoch": 2.737220447284345, + "grad_norm": 0.09471739828586578, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3427 + }, + { + "epoch": 2.7380191693290734, + "grad_norm": 0.18026123940944672, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3428 + }, + { + "epoch": 2.738817891373802, + "grad_norm": 0.1737871915102005, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3429 + }, + { + "epoch": 2.7396166134185305, + "grad_norm": 0.052994512021541595, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3430 + }, + { + "epoch": 2.7404153354632586, + "grad_norm": 0.13484452664852142, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 3431 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 0.2207227200269699, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3432 + }, + { + "epoch": 2.7420127795527156, + "grad_norm": 0.17741963267326355, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3433 + }, + { + "epoch": 2.742811501597444, + "grad_norm": 0.07451824843883514, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3434 + }, + { + "epoch": 2.7436102236421727, + "grad_norm": 0.07947403192520142, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3435 + }, + { + "epoch": 2.744408945686901, + "grad_norm": 0.11197762936353683, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3436 + }, + { + "epoch": 2.7452076677316293, + "grad_norm": 0.08398377895355225, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 3437 + }, + { + "epoch": 2.746006389776358, + "grad_norm": 0.03809420019388199, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3438 + }, + { + "epoch": 2.7468051118210863, + "grad_norm": 0.11537694931030273, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3439 + }, + { + "epoch": 2.747603833865815, + "grad_norm": 0.1537221372127533, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3440 + }, + { + "epoch": 2.748402555910543, + "grad_norm": 0.1132403165102005, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3441 + }, + { + "epoch": 2.7492012779552715, + "grad_norm": 0.038440920412540436, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3442 + }, + { + "epoch": 2.75, + "grad_norm": 0.10132595151662827, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3443 + }, + { + "epoch": 2.7507987220447285, + "grad_norm": 0.12446253001689911, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3444 + }, + { + "epoch": 2.751597444089457, + "grad_norm": 0.05364474281668663, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3445 + }, + { + "epoch": 2.752396166134185, + "grad_norm": 0.04705234244465828, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3446 + }, + { + "epoch": 2.7531948881789137, + "grad_norm": 0.10524975508451462, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3447 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 0.12036000937223434, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3448 + }, + { + "epoch": 2.7547923322683707, + "grad_norm": 0.08042819797992706, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3449 + }, + { + "epoch": 2.755591054313099, + "grad_norm": 0.04404102638363838, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3450 + }, + { + "epoch": 2.7563897763578273, + "grad_norm": 0.0766257792711258, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3451 + }, + { + "epoch": 2.757188498402556, + "grad_norm": 0.06359248608350754, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3452 + }, + { + "epoch": 2.7579872204472844, + "grad_norm": 0.06752901524305344, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3453 + }, + { + "epoch": 2.758785942492013, + "grad_norm": 0.12018375843763351, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3454 + }, + { + "epoch": 2.7595846645367414, + "grad_norm": 0.15904727578163147, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3455 + }, + { + "epoch": 2.7603833865814695, + "grad_norm": 0.12665021419525146, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3456 + }, + { + "epoch": 2.761182108626198, + "grad_norm": 0.07552342861890793, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3457 + }, + { + "epoch": 2.7619808306709266, + "grad_norm": 0.25927653908729553, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3458 + }, + { + "epoch": 2.762779552715655, + "grad_norm": 0.3487590253353119, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3459 + }, + { + "epoch": 2.763578274760383, + "grad_norm": 0.2783665359020233, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3460 + }, + { + "epoch": 2.7643769968051117, + "grad_norm": 0.054424334317445755, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3461 + }, + { + "epoch": 2.7651757188498403, + "grad_norm": 0.240921288728714, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3462 + }, + { + "epoch": 2.765974440894569, + "grad_norm": 0.3380962014198303, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3463 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.1514623463153839, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3464 + }, + { + "epoch": 2.7675718849840254, + "grad_norm": 0.15135464072227478, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3465 + }, + { + "epoch": 2.768370607028754, + "grad_norm": 0.262546181678772, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3466 + }, + { + "epoch": 2.7691693290734825, + "grad_norm": 0.11052273958921432, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3467 + }, + { + "epoch": 2.769968051118211, + "grad_norm": 0.14473804831504822, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3468 + }, + { + "epoch": 2.770766773162939, + "grad_norm": 0.24968142807483673, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3469 + }, + { + "epoch": 2.7715654952076676, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3470 + }, + { + "epoch": 2.772364217252396, + "grad_norm": 0.0957072302699089, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3471 + }, + { + "epoch": 2.7731629392971247, + "grad_norm": 0.2122000902891159, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3472 + }, + { + "epoch": 2.773961661341853, + "grad_norm": 0.15716226398944855, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3473 + }, + { + "epoch": 2.7747603833865817, + "grad_norm": 0.05107169970870018, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3474 + }, + { + "epoch": 2.77555910543131, + "grad_norm": 0.19824674725532532, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3475 + }, + { + "epoch": 2.7763578274760383, + "grad_norm": 0.16866235435009003, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3476 + }, + { + "epoch": 2.777156549520767, + "grad_norm": 0.03332412987947464, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3477 + }, + { + "epoch": 2.777955271565495, + "grad_norm": 0.1771237850189209, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3478 + }, + { + "epoch": 2.7787539936102235, + "grad_norm": 0.23501509428024292, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3479 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.0976579561829567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3480 + }, + { + "epoch": 2.7803514376996805, + "grad_norm": 0.11640458554029465, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3481 + }, + { + "epoch": 2.781150159744409, + "grad_norm": 0.2140960842370987, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3482 + }, + { + "epoch": 2.7819488817891376, + "grad_norm": 0.2055736929178238, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3483 + }, + { + "epoch": 2.7827476038338657, + "grad_norm": 0.09386937320232391, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 3484 + }, + { + "epoch": 2.783546325878594, + "grad_norm": 0.11534380912780762, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3485 + }, + { + "epoch": 2.7843450479233227, + "grad_norm": 0.19186711311340332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3486 + }, + { + "epoch": 2.7851437699680512, + "grad_norm": 0.26858124136924744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3487 + }, + { + "epoch": 2.7859424920127793, + "grad_norm": 0.05965370684862137, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 3488 + }, + { + "epoch": 2.786741214057508, + "grad_norm": 0.17804528772830963, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3489 + }, + { + "epoch": 2.7875399361022364, + "grad_norm": 0.1802065223455429, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 3490 + }, + { + "epoch": 2.788338658146965, + "grad_norm": 0.06634502857923508, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3491 + }, + { + "epoch": 2.7891373801916934, + "grad_norm": 0.06682102382183075, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3492 + }, + { + "epoch": 2.789936102236422, + "grad_norm": 0.08941584080457687, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3493 + }, + { + "epoch": 2.79073482428115, + "grad_norm": 0.06336037069559097, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3494 + }, + { + "epoch": 2.7915335463258786, + "grad_norm": 0.05562690272927284, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3495 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.10294149816036224, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3496 + }, + { + "epoch": 2.793130990415335, + "grad_norm": 0.11363442987203598, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3497 + }, + { + "epoch": 2.7939297124600637, + "grad_norm": 0.05790446698665619, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3498 + }, + { + "epoch": 2.7947284345047922, + "grad_norm": 0.09351370483636856, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3499 + }, + { + "epoch": 2.7955271565495208, + "grad_norm": 0.2225412130355835, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3500 + }, + { + "epoch": 2.7963258785942493, + "grad_norm": 0.21828165650367737, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3501 + }, + { + "epoch": 2.797124600638978, + "grad_norm": 0.06987733393907547, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 3502 + }, + { + "epoch": 2.797923322683706, + "grad_norm": 0.14518103003501892, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3503 + }, + { + "epoch": 2.7987220447284344, + "grad_norm": 0.24233761429786682, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3504 + }, + { + "epoch": 2.799520766773163, + "grad_norm": 0.19286365807056427, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3505 + }, + { + "epoch": 2.8003194888178915, + "grad_norm": 0.07981286942958832, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3506 + }, + { + "epoch": 2.8011182108626196, + "grad_norm": 0.050319187343120575, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3507 + }, + { + "epoch": 2.801916932907348, + "grad_norm": 0.09955406934022903, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3508 + }, + { + "epoch": 2.8027156549520766, + "grad_norm": 0.048427898436784744, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3509 + }, + { + "epoch": 2.803514376996805, + "grad_norm": 0.0805777907371521, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3510 + }, + { + "epoch": 2.8043130990415337, + "grad_norm": 0.07289621978998184, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3511 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.04940955713391304, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3512 + }, + { + "epoch": 2.8059105431309903, + "grad_norm": 0.07228294759988785, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3513 + }, + { + "epoch": 2.806709265175719, + "grad_norm": 0.06902103871107101, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3514 + }, + { + "epoch": 2.8075079872204474, + "grad_norm": 0.056301236152648926, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3515 + }, + { + "epoch": 2.8083067092651754, + "grad_norm": 0.03880859166383743, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3516 + }, + { + "epoch": 2.809105431309904, + "grad_norm": 0.04914811998605728, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3517 + }, + { + "epoch": 2.8099041533546325, + "grad_norm": 0.04139270633459091, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3518 + }, + { + "epoch": 2.810702875399361, + "grad_norm": 0.05118592828512192, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3519 + }, + { + "epoch": 2.8115015974440896, + "grad_norm": 0.03548616170883179, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 3520 + }, + { + "epoch": 2.812300319488818, + "grad_norm": 0.04883241280913353, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3521 + }, + { + "epoch": 2.813099041533546, + "grad_norm": 0.044492170214653015, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3522 + }, + { + "epoch": 2.8138977635782747, + "grad_norm": 0.050978366285562515, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3523 + }, + { + "epoch": 2.8146964856230032, + "grad_norm": 0.04663826525211334, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3524 + }, + { + "epoch": 2.8154952076677318, + "grad_norm": 0.06378154456615448, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3525 + }, + { + "epoch": 2.81629392971246, + "grad_norm": 0.06913618743419647, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3526 + }, + { + "epoch": 2.8170926517571884, + "grad_norm": 0.084662064909935, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3527 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 0.08352439105510712, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3528 + }, + { + "epoch": 2.8186900958466454, + "grad_norm": 0.07254189252853394, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3529 + }, + { + "epoch": 2.819488817891374, + "grad_norm": 0.04416285827755928, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 3530 + }, + { + "epoch": 2.8202875399361025, + "grad_norm": 0.056230951100587845, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3531 + }, + { + "epoch": 2.8210862619808306, + "grad_norm": 0.11055732518434525, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3532 + }, + { + "epoch": 2.821884984025559, + "grad_norm": 0.08660246431827545, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3533 + }, + { + "epoch": 2.8226837060702876, + "grad_norm": 0.0691947191953659, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3534 + }, + { + "epoch": 2.8234824281150157, + "grad_norm": 0.09254545718431473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3535 + }, + { + "epoch": 2.8242811501597442, + "grad_norm": 0.0663340613245964, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3536 + }, + { + "epoch": 2.8250798722044728, + "grad_norm": 0.05052514374256134, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3537 + }, + { + "epoch": 2.8258785942492013, + "grad_norm": 0.08364969491958618, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3538 + }, + { + "epoch": 2.82667731629393, + "grad_norm": 0.08269570767879486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3539 + }, + { + "epoch": 2.8274760383386583, + "grad_norm": 0.06289245933294296, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3540 + }, + { + "epoch": 2.8282747603833864, + "grad_norm": 0.03565627336502075, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3541 + }, + { + "epoch": 2.829073482428115, + "grad_norm": 0.057896651327610016, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3542 + }, + { + "epoch": 2.8298722044728435, + "grad_norm": 0.046379514038562775, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3543 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.06231336295604706, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3544 + }, + { + "epoch": 2.8314696485623, + "grad_norm": 0.03983502462506294, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3545 + }, + { + "epoch": 2.8322683706070286, + "grad_norm": 0.07364759594202042, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3546 + }, + { + "epoch": 2.833067092651757, + "grad_norm": 0.11596816778182983, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3547 + }, + { + "epoch": 2.8338658146964857, + "grad_norm": 0.10731378942728043, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3548 + }, + { + "epoch": 2.834664536741214, + "grad_norm": 0.06365050375461578, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3549 + }, + { + "epoch": 2.8354632587859427, + "grad_norm": 0.055451441556215286, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3550 + }, + { + "epoch": 2.836261980830671, + "grad_norm": 0.1490558534860611, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3551 + }, + { + "epoch": 2.8370607028753994, + "grad_norm": 0.1539796143770218, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3552 + }, + { + "epoch": 2.837859424920128, + "grad_norm": 0.06760501861572266, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3553 + }, + { + "epoch": 2.838658146964856, + "grad_norm": 0.0685611367225647, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 3554 + }, + { + "epoch": 2.8394568690095845, + "grad_norm": 0.14234358072280884, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3555 + }, + { + "epoch": 2.840255591054313, + "grad_norm": 0.14428865909576416, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3556 + }, + { + "epoch": 2.8410543130990416, + "grad_norm": 0.07594695687294006, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3557 + }, + { + "epoch": 2.84185303514377, + "grad_norm": 0.040841538459062576, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3558 + }, + { + "epoch": 2.8426517571884986, + "grad_norm": 0.04991824924945831, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3559 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 0.03846943378448486, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3560 + }, + { + "epoch": 2.844249201277955, + "grad_norm": 0.04851507395505905, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3561 + }, + { + "epoch": 2.8450479233226837, + "grad_norm": 0.0635538399219513, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3562 + }, + { + "epoch": 2.8458466453674123, + "grad_norm": 0.11812663078308105, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3563 + }, + { + "epoch": 2.8466453674121404, + "grad_norm": 0.05664098262786865, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3564 + }, + { + "epoch": 2.847444089456869, + "grad_norm": 0.03532585874199867, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3565 + }, + { + "epoch": 2.8482428115015974, + "grad_norm": 0.06758403033018112, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 3566 + }, + { + "epoch": 2.849041533546326, + "grad_norm": 0.06279300898313522, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3567 + }, + { + "epoch": 2.8498402555910545, + "grad_norm": 0.043967198580503464, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3568 + }, + { + "epoch": 2.850638977635783, + "grad_norm": 0.04900701716542244, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 3569 + }, + { + "epoch": 2.851437699680511, + "grad_norm": 0.07339311391115189, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3570 + }, + { + "epoch": 2.8522364217252396, + "grad_norm": 0.10644743591547012, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3571 + }, + { + "epoch": 2.853035143769968, + "grad_norm": 0.10544353723526001, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3572 + }, + { + "epoch": 2.8538338658146962, + "grad_norm": 0.0590951181948185, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3573 + }, + { + "epoch": 2.8546325878594248, + "grad_norm": 0.05038939788937569, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3574 + }, + { + "epoch": 2.8554313099041533, + "grad_norm": 0.06013040617108345, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3575 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.07330521196126938, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3576 + }, + { + "epoch": 2.8570287539936103, + "grad_norm": 0.12049853056669235, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3577 + }, + { + "epoch": 2.857827476038339, + "grad_norm": 0.13056780397891998, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3578 + }, + { + "epoch": 2.858626198083067, + "grad_norm": 0.12987029552459717, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3579 + }, + { + "epoch": 2.8594249201277955, + "grad_norm": 0.08681001514196396, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3580 + }, + { + "epoch": 2.860223642172524, + "grad_norm": 0.060947105288505554, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3581 + }, + { + "epoch": 2.8610223642172525, + "grad_norm": 0.10896368324756622, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3582 + }, + { + "epoch": 2.8618210862619806, + "grad_norm": 0.1251460760831833, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3583 + }, + { + "epoch": 2.862619808306709, + "grad_norm": 0.035174671560525894, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3584 + }, + { + "epoch": 2.8634185303514377, + "grad_norm": 0.12026303261518478, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3585 + }, + { + "epoch": 2.864217252396166, + "grad_norm": 0.16679063439369202, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3586 + }, + { + "epoch": 2.8650159744408947, + "grad_norm": 0.19229409098625183, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3587 + }, + { + "epoch": 2.8658146964856233, + "grad_norm": 0.17964699864387512, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3588 + }, + { + "epoch": 2.8666134185303513, + "grad_norm": 0.10671430081129074, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3589 + }, + { + "epoch": 2.86741214057508, + "grad_norm": 0.04453161358833313, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 3590 + }, + { + "epoch": 2.8682108626198084, + "grad_norm": 0.1531655639410019, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3591 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.19321779906749725, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3592 + }, + { + "epoch": 2.869808306709265, + "grad_norm": 0.19540782272815704, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3593 + }, + { + "epoch": 2.8706070287539935, + "grad_norm": 0.22210878133773804, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3594 + }, + { + "epoch": 2.871405750798722, + "grad_norm": 0.2089247703552246, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3595 + }, + { + "epoch": 2.8722044728434506, + "grad_norm": 0.11910446733236313, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3596 + }, + { + "epoch": 2.873003194888179, + "grad_norm": 0.05230247974395752, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3597 + }, + { + "epoch": 2.873801916932907, + "grad_norm": 0.09492263197898865, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3598 + }, + { + "epoch": 2.8746006389776357, + "grad_norm": 0.1396690160036087, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3599 + }, + { + "epoch": 2.8753993610223643, + "grad_norm": 0.12218718230724335, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3600 + }, + { + "epoch": 2.876198083067093, + "grad_norm": 0.05510007217526436, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 3601 + }, + { + "epoch": 2.876996805111821, + "grad_norm": 0.04949348792433739, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3602 + }, + { + "epoch": 2.8777955271565494, + "grad_norm": 0.06522537767887115, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3603 + }, + { + "epoch": 2.878594249201278, + "grad_norm": 0.034176018089056015, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3604 + }, + { + "epoch": 2.8793929712460065, + "grad_norm": 0.07579770684242249, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3605 + }, + { + "epoch": 2.880191693290735, + "grad_norm": 0.09512948244810104, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3606 + }, + { + "epoch": 2.8809904153354635, + "grad_norm": 0.059753213077783585, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3607 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.2461470365524292, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3608 + }, + { + "epoch": 2.88258785942492, + "grad_norm": 0.11298660188913345, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3609 + }, + { + "epoch": 2.8833865814696487, + "grad_norm": 0.20638997852802277, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3610 + }, + { + "epoch": 2.8841853035143767, + "grad_norm": 0.2394232600927353, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3611 + }, + { + "epoch": 2.8849840255591053, + "grad_norm": 0.15168963372707367, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3612 + }, + { + "epoch": 2.885782747603834, + "grad_norm": 0.03990825638175011, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3613 + }, + { + "epoch": 2.8865814696485623, + "grad_norm": 0.1725347936153412, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3614 + }, + { + "epoch": 2.887380191693291, + "grad_norm": 0.20821869373321533, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3615 + }, + { + "epoch": 2.8881789137380194, + "grad_norm": 0.14441269636154175, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3616 + }, + { + "epoch": 2.8889776357827475, + "grad_norm": 0.037162624299526215, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3617 + }, + { + "epoch": 2.889776357827476, + "grad_norm": 0.11550657451152802, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3618 + }, + { + "epoch": 2.8905750798722045, + "grad_norm": 0.15214277803897858, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3619 + }, + { + "epoch": 2.891373801916933, + "grad_norm": 0.09059946238994598, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3620 + }, + { + "epoch": 2.892172523961661, + "grad_norm": 0.03436599299311638, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3621 + }, + { + "epoch": 2.8929712460063897, + "grad_norm": 0.0839625746011734, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3622 + }, + { + "epoch": 2.893769968051118, + "grad_norm": 0.1618664264678955, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3623 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.08216597139835358, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3624 + }, + { + "epoch": 2.8953674121405752, + "grad_norm": 0.06303965300321579, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3625 + }, + { + "epoch": 2.8961661341853038, + "grad_norm": 0.050278183072805405, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3626 + }, + { + "epoch": 2.896964856230032, + "grad_norm": 0.04620242863893509, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3627 + }, + { + "epoch": 2.8977635782747604, + "grad_norm": 0.04937691614031792, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3628 + }, + { + "epoch": 2.898562300319489, + "grad_norm": 0.056928347796201706, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3629 + }, + { + "epoch": 2.899361022364217, + "grad_norm": 0.04932256042957306, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3630 + }, + { + "epoch": 2.9001597444089455, + "grad_norm": 0.04320303350687027, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3631 + }, + { + "epoch": 2.900958466453674, + "grad_norm": 0.08589868247509003, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3632 + }, + { + "epoch": 2.9017571884984026, + "grad_norm": 0.11458484083414078, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3633 + }, + { + "epoch": 2.902555910543131, + "grad_norm": 0.13549752533435822, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3634 + }, + { + "epoch": 2.9033546325878596, + "grad_norm": 0.1327086091041565, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3635 + }, + { + "epoch": 2.9041533546325877, + "grad_norm": 0.08295682817697525, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3636 + }, + { + "epoch": 2.9049520766773163, + "grad_norm": 0.05216526240110397, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3637 + }, + { + "epoch": 2.905750798722045, + "grad_norm": 0.11048691719770432, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3638 + }, + { + "epoch": 2.9065495207667733, + "grad_norm": 0.17681372165679932, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3639 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.16901300847530365, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3640 + }, + { + "epoch": 2.90814696485623, + "grad_norm": 0.10261020064353943, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3641 + }, + { + "epoch": 2.9089456869009584, + "grad_norm": 0.042478349059820175, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3642 + }, + { + "epoch": 2.909744408945687, + "grad_norm": 0.11727496981620789, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3643 + }, + { + "epoch": 2.9105431309904155, + "grad_norm": 0.14884977042675018, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3644 + }, + { + "epoch": 2.911341853035144, + "grad_norm": 0.047877270728349686, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3645 + }, + { + "epoch": 2.912140575079872, + "grad_norm": 0.11930714547634125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3646 + }, + { + "epoch": 2.9129392971246006, + "grad_norm": 0.1873956024646759, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3647 + }, + { + "epoch": 2.913738019169329, + "grad_norm": 0.22310249507427216, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3648 + }, + { + "epoch": 2.9145367412140573, + "grad_norm": 0.21259911358356476, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3649 + }, + { + "epoch": 2.915335463258786, + "grad_norm": 0.11584217846393585, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3650 + }, + { + "epoch": 2.9161341853035143, + "grad_norm": 0.04092720150947571, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3651 + }, + { + "epoch": 2.916932907348243, + "grad_norm": 0.14542047679424286, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3652 + }, + { + "epoch": 2.9177316293929714, + "grad_norm": 0.16328515112400055, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3653 + }, + { + "epoch": 2.9185303514377, + "grad_norm": 0.11284583806991577, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3654 + }, + { + "epoch": 2.919329073482428, + "grad_norm": 0.03723357245326042, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3655 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.1347448229789734, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3656 + }, + { + "epoch": 2.920926517571885, + "grad_norm": 0.1697797328233719, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3657 + }, + { + "epoch": 2.9217252396166136, + "grad_norm": 0.12122484296560287, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3658 + }, + { + "epoch": 2.9225239616613417, + "grad_norm": 0.043503791093826294, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3659 + }, + { + "epoch": 2.92332268370607, + "grad_norm": 0.1600242555141449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3660 + }, + { + "epoch": 2.9241214057507987, + "grad_norm": 0.21065576374530792, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3661 + }, + { + "epoch": 2.9249201277955272, + "grad_norm": 0.16726253926753998, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3662 + }, + { + "epoch": 2.9257188498402558, + "grad_norm": 0.09178615361452103, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3663 + }, + { + "epoch": 2.9265175718849843, + "grad_norm": 0.0447201170027256, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3664 + }, + { + "epoch": 2.9273162939297124, + "grad_norm": 0.10462333261966705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3665 + }, + { + "epoch": 2.928115015974441, + "grad_norm": 0.08236772567033768, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3666 + }, + { + "epoch": 2.9289137380191694, + "grad_norm": 0.06551375985145569, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3667 + }, + { + "epoch": 2.9297124600638975, + "grad_norm": 0.1531982123851776, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3668 + }, + { + "epoch": 2.930511182108626, + "grad_norm": 0.19483166933059692, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3669 + }, + { + "epoch": 2.9313099041533546, + "grad_norm": 0.12347809225320816, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3670 + }, + { + "epoch": 2.932108626198083, + "grad_norm": 0.05494467169046402, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3671 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 0.2280847579240799, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3672 + }, + { + "epoch": 2.93370607028754, + "grad_norm": 0.30344241857528687, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3673 + }, + { + "epoch": 2.9345047923322682, + "grad_norm": 0.243449404835701, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3674 + }, + { + "epoch": 2.9353035143769968, + "grad_norm": 0.11542543768882751, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3675 + }, + { + "epoch": 2.9361022364217253, + "grad_norm": 0.09501481056213379, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3676 + }, + { + "epoch": 2.936900958466454, + "grad_norm": 0.2299363762140274, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3677 + }, + { + "epoch": 2.937699680511182, + "grad_norm": 0.15020152926445007, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3678 + }, + { + "epoch": 2.9384984025559104, + "grad_norm": 0.0655093789100647, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3679 + }, + { + "epoch": 2.939297124600639, + "grad_norm": 0.15242713689804077, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3680 + }, + { + "epoch": 2.9400958466453675, + "grad_norm": 0.13315139710903168, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3681 + }, + { + "epoch": 2.940894568690096, + "grad_norm": 0.05966462939977646, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3682 + }, + { + "epoch": 2.9416932907348246, + "grad_norm": 0.08146806806325912, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3683 + }, + { + "epoch": 2.9424920127795526, + "grad_norm": 0.13615436851978302, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3684 + }, + { + "epoch": 2.943290734824281, + "grad_norm": 0.10889092832803726, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3685 + }, + { + "epoch": 2.9440894568690097, + "grad_norm": 0.03455124795436859, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3686 + }, + { + "epoch": 2.9448881789137378, + "grad_norm": 0.07490532845258713, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3687 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.08072194457054138, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3688 + }, + { + "epoch": 2.946485623003195, + "grad_norm": 0.03630111739039421, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3689 + }, + { + "epoch": 2.9472843450479234, + "grad_norm": 0.09075939655303955, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3690 + }, + { + "epoch": 2.948083067092652, + "grad_norm": 0.1618475615978241, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3691 + }, + { + "epoch": 2.9488817891373804, + "grad_norm": 0.18354517221450806, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3692 + }, + { + "epoch": 2.9496805111821085, + "grad_norm": 0.170358344912529, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3693 + }, + { + "epoch": 2.950479233226837, + "grad_norm": 0.10800250619649887, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3694 + }, + { + "epoch": 2.9512779552715656, + "grad_norm": 0.03771398589015007, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3695 + }, + { + "epoch": 2.952076677316294, + "grad_norm": 0.07931157946586609, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3696 + }, + { + "epoch": 2.952875399361022, + "grad_norm": 0.08149557560682297, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3697 + }, + { + "epoch": 2.9536741214057507, + "grad_norm": 0.05122899264097214, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3698 + }, + { + "epoch": 2.9544728434504792, + "grad_norm": 0.040845707058906555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3699 + }, + { + "epoch": 2.9552715654952078, + "grad_norm": 0.11444225907325745, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3700 + }, + { + "epoch": 2.9560702875399363, + "grad_norm": 0.20140959322452545, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3701 + }, + { + "epoch": 2.956869009584665, + "grad_norm": 0.24982111155986786, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3702 + }, + { + "epoch": 2.957667731629393, + "grad_norm": 0.21290510892868042, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3703 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.11526014655828476, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3704 + }, + { + "epoch": 2.95926517571885, + "grad_norm": 0.03769242390990257, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3705 + }, + { + "epoch": 2.960063897763578, + "grad_norm": 0.091837577521801, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3706 + }, + { + "epoch": 2.9608626198083066, + "grad_norm": 0.0956759825348854, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3707 + }, + { + "epoch": 2.961661341853035, + "grad_norm": 0.06945781409740448, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3708 + }, + { + "epoch": 2.9624600638977636, + "grad_norm": 0.03904029354453087, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3709 + }, + { + "epoch": 2.963258785942492, + "grad_norm": 0.1264238953590393, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3710 + }, + { + "epoch": 2.9640575079872207, + "grad_norm": 0.1689605861902237, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3711 + }, + { + "epoch": 2.9648562300319488, + "grad_norm": 0.15059368312358856, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3712 + }, + { + "epoch": 2.9656549520766773, + "grad_norm": 0.12976346909999847, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3713 + }, + { + "epoch": 2.966453674121406, + "grad_norm": 0.08460741490125656, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3714 + }, + { + "epoch": 2.9672523961661343, + "grad_norm": 0.04914790764451027, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3715 + }, + { + "epoch": 2.9680511182108624, + "grad_norm": 0.09629235416650772, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3716 + }, + { + "epoch": 2.968849840255591, + "grad_norm": 0.0895731970667839, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 3717 + }, + { + "epoch": 2.9696485623003195, + "grad_norm": 0.039528124034404755, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3718 + }, + { + "epoch": 2.970447284345048, + "grad_norm": 0.12843455374240875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3719 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.1754530519247055, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3720 + }, + { + "epoch": 2.972044728434505, + "grad_norm": 0.14169782400131226, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3721 + }, + { + "epoch": 2.972843450479233, + "grad_norm": 0.04416975378990173, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3722 + }, + { + "epoch": 2.9736421725239617, + "grad_norm": 0.1259031444787979, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3723 + }, + { + "epoch": 2.97444089456869, + "grad_norm": 0.17667949199676514, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3724 + }, + { + "epoch": 2.9752396166134183, + "grad_norm": 0.1213974729180336, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3725 + }, + { + "epoch": 2.976038338658147, + "grad_norm": 0.052554335445165634, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3726 + }, + { + "epoch": 2.9768370607028753, + "grad_norm": 0.13257208466529846, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3727 + }, + { + "epoch": 2.977635782747604, + "grad_norm": 0.1463504135608673, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3728 + }, + { + "epoch": 2.9784345047923324, + "grad_norm": 0.08546306937932968, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3729 + }, + { + "epoch": 2.979233226837061, + "grad_norm": 0.04226094111800194, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3730 + }, + { + "epoch": 2.980031948881789, + "grad_norm": 0.0924859419465065, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3731 + }, + { + "epoch": 2.9808306709265175, + "grad_norm": 0.1094423234462738, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3732 + }, + { + "epoch": 2.981629392971246, + "grad_norm": 0.11132006347179413, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3733 + }, + { + "epoch": 2.9824281150159746, + "grad_norm": 0.11010250449180603, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3734 + }, + { + "epoch": 2.9832268370607027, + "grad_norm": 0.10370460152626038, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3735 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 0.08460240811109543, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3736 + }, + { + "epoch": 2.9848242811501597, + "grad_norm": 0.06218400225043297, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3737 + }, + { + "epoch": 2.9856230031948883, + "grad_norm": 0.07446395605802536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3738 + }, + { + "epoch": 2.986421725239617, + "grad_norm": 0.06072726845741272, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3739 + }, + { + "epoch": 2.987220447284345, + "grad_norm": 0.07607559114694595, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3740 + }, + { + "epoch": 2.9880191693290734, + "grad_norm": 0.151380717754364, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3741 + }, + { + "epoch": 2.988817891373802, + "grad_norm": 0.24132277071475983, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3742 + }, + { + "epoch": 2.9896166134185305, + "grad_norm": 0.2346547245979309, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3743 + }, + { + "epoch": 2.9904153354632586, + "grad_norm": 0.090092234313488, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3744 + }, + { + "epoch": 2.991214057507987, + "grad_norm": 0.10230003297328949, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3745 + }, + { + "epoch": 2.9920127795527156, + "grad_norm": 0.17678654193878174, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3746 + }, + { + "epoch": 2.992811501597444, + "grad_norm": 0.16382110118865967, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3747 + }, + { + "epoch": 2.9936102236421727, + "grad_norm": 0.06456442922353745, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3748 + }, + { + "epoch": 2.994408945686901, + "grad_norm": 0.1774967759847641, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3749 + }, + { + "epoch": 2.9952076677316293, + "grad_norm": 0.19274447858333588, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3750 + }, + { + "epoch": 2.996006389776358, + "grad_norm": 0.10767998546361923, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3751 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.07864238321781158, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3752 + }, + { + "epoch": 2.997603833865815, + "grad_norm": 0.21339190006256104, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3753 + }, + { + "epoch": 2.998402555910543, + "grad_norm": 0.2560347616672516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3754 + }, + { + "epoch": 2.9992012779552715, + "grad_norm": 0.15730907022953033, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3755 + }, + { + "epoch": 3.0, + "grad_norm": 0.09766457974910736, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3756 + }, + { + "epoch": 3.0007987220447285, + "grad_norm": 0.24393433332443237, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3757 + }, + { + "epoch": 3.001597444089457, + "grad_norm": 0.17650263011455536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3758 + }, + { + "epoch": 3.002396166134185, + "grad_norm": 0.06490518152713776, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3759 + }, + { + "epoch": 3.0031948881789137, + "grad_norm": 0.10893388092517853, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3760 + }, + { + "epoch": 3.003993610223642, + "grad_norm": 0.13606922328472137, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3761 + }, + { + "epoch": 3.0047923322683707, + "grad_norm": 0.07880546152591705, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3762 + }, + { + "epoch": 3.0055910543130993, + "grad_norm": 0.04203686863183975, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3763 + }, + { + "epoch": 3.0063897763578273, + "grad_norm": 0.07509997487068176, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3764 + }, + { + "epoch": 3.007188498402556, + "grad_norm": 0.08529910445213318, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3765 + }, + { + "epoch": 3.0079872204472844, + "grad_norm": 0.05542825534939766, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3766 + }, + { + "epoch": 3.008785942492013, + "grad_norm": 0.08245155215263367, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3767 + }, + { + "epoch": 3.009584664536741, + "grad_norm": 0.09580255299806595, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3768 + }, + { + "epoch": 3.0103833865814695, + "grad_norm": 0.08233854174613953, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3769 + }, + { + "epoch": 3.011182108626198, + "grad_norm": 0.0589553639292717, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3770 + }, + { + "epoch": 3.0119808306709266, + "grad_norm": 0.09862494468688965, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3771 + }, + { + "epoch": 3.012779552715655, + "grad_norm": 0.1471278816461563, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3772 + }, + { + "epoch": 3.013578274760383, + "grad_norm": 0.1422986537218094, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3773 + }, + { + "epoch": 3.0143769968051117, + "grad_norm": 0.06627846509218216, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3774 + }, + { + "epoch": 3.0151757188498403, + "grad_norm": 0.04936077445745468, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3775 + }, + { + "epoch": 3.015974440894569, + "grad_norm": 0.0745953619480133, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3776 + }, + { + "epoch": 3.0167731629392973, + "grad_norm": 0.0725102499127388, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3777 + }, + { + "epoch": 3.0175718849840254, + "grad_norm": 0.04181717708706856, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3778 + }, + { + "epoch": 3.018370607028754, + "grad_norm": 0.09955357760190964, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3779 + }, + { + "epoch": 3.0191693290734825, + "grad_norm": 0.21014735102653503, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3780 + }, + { + "epoch": 3.019968051118211, + "grad_norm": 0.30597689747810364, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3781 + }, + { + "epoch": 3.0207667731629395, + "grad_norm": 0.2930602431297302, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3782 + }, + { + "epoch": 3.0215654952076676, + "grad_norm": 0.1190100908279419, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3783 + }, + { + "epoch": 3.022364217252396, + "grad_norm": 0.0655524879693985, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3784 + }, + { + "epoch": 3.0231629392971247, + "grad_norm": 0.12062554061412811, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3785 + }, + { + "epoch": 3.023961661341853, + "grad_norm": 0.09680327773094177, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3786 + }, + { + "epoch": 3.0247603833865813, + "grad_norm": 0.0555860660970211, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3787 + }, + { + "epoch": 3.02555910543131, + "grad_norm": 0.1271962672472, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3788 + }, + { + "epoch": 3.0263578274760383, + "grad_norm": 0.12178758531808853, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3789 + }, + { + "epoch": 3.027156549520767, + "grad_norm": 0.09623143821954727, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3790 + }, + { + "epoch": 3.0279552715654954, + "grad_norm": 0.04004101827740669, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3791 + }, + { + "epoch": 3.0287539936102235, + "grad_norm": 0.14001014828681946, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3792 + }, + { + "epoch": 3.029552715654952, + "grad_norm": 0.24241770803928375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3793 + }, + { + "epoch": 3.0303514376996805, + "grad_norm": 0.29141902923583984, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3794 + }, + { + "epoch": 3.031150159744409, + "grad_norm": 0.22814971208572388, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 3795 + }, + { + "epoch": 3.0319488817891376, + "grad_norm": 0.08114828914403915, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3796 + }, + { + "epoch": 3.0327476038338657, + "grad_norm": 0.08104736357927322, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3797 + }, + { + "epoch": 3.033546325878594, + "grad_norm": 0.12007702887058258, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3798 + }, + { + "epoch": 3.0343450479233227, + "grad_norm": 0.06497872620820999, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3799 + }, + { + "epoch": 3.0351437699680512, + "grad_norm": 0.07407233864068985, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3800 + }, + { + "epoch": 3.0359424920127798, + "grad_norm": 0.16386932134628296, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3801 + }, + { + "epoch": 3.036741214057508, + "grad_norm": 0.21633599698543549, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3802 + }, + { + "epoch": 3.0375399361022364, + "grad_norm": 0.19224147498607635, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3803 + }, + { + "epoch": 3.038338658146965, + "grad_norm": 0.04962728172540665, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3804 + }, + { + "epoch": 3.0391373801916934, + "grad_norm": 0.17984353005886078, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3805 + }, + { + "epoch": 3.0399361022364215, + "grad_norm": 0.31483346223831177, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3806 + }, + { + "epoch": 3.04073482428115, + "grad_norm": 0.27175095677375793, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3807 + }, + { + "epoch": 3.0415335463258786, + "grad_norm": 0.06302175670862198, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3808 + }, + { + "epoch": 3.042332268370607, + "grad_norm": 0.18620255589485168, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3809 + }, + { + "epoch": 3.0431309904153356, + "grad_norm": 0.23254868388175964, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3810 + }, + { + "epoch": 3.0439297124600637, + "grad_norm": 0.08763844519853592, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3811 + }, + { + "epoch": 3.0447284345047922, + "grad_norm": 0.13173392415046692, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3812 + }, + { + "epoch": 3.0455271565495208, + "grad_norm": 0.24171577394008636, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3813 + }, + { + "epoch": 3.0463258785942493, + "grad_norm": 0.17649634182453156, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3814 + }, + { + "epoch": 3.047124600638978, + "grad_norm": 0.03800780326128006, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3815 + }, + { + "epoch": 3.047923322683706, + "grad_norm": 0.20039476454257965, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3816 + }, + { + "epoch": 3.0487220447284344, + "grad_norm": 0.26794761419296265, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 3817 + }, + { + "epoch": 3.049520766773163, + "grad_norm": 0.18026290833950043, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3818 + }, + { + "epoch": 3.0503194888178915, + "grad_norm": 0.07298897206783295, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 3819 + }, + { + "epoch": 3.0511182108626196, + "grad_norm": 0.11078597605228424, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 3820 + }, + { + "epoch": 3.051916932907348, + "grad_norm": 0.13672129809856415, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3821 + }, + { + "epoch": 3.0527156549520766, + "grad_norm": 0.11172370612621307, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3822 + }, + { + "epoch": 3.053514376996805, + "grad_norm": 0.09000302106142044, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3823 + }, + { + "epoch": 3.0543130990415337, + "grad_norm": 0.055291030555963516, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3824 + }, + { + "epoch": 3.055111821086262, + "grad_norm": 0.05691349133849144, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3825 + }, + { + "epoch": 3.0559105431309903, + "grad_norm": 0.0744122862815857, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3826 + }, + { + "epoch": 3.056709265175719, + "grad_norm": 0.06438847631216049, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3827 + }, + { + "epoch": 3.0575079872204474, + "grad_norm": 0.0926717221736908, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3828 + }, + { + "epoch": 3.058306709265176, + "grad_norm": 0.15286727249622345, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3829 + }, + { + "epoch": 3.059105431309904, + "grad_norm": 0.2049989253282547, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3830 + }, + { + "epoch": 3.0599041533546325, + "grad_norm": 0.1832154393196106, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3831 + }, + { + "epoch": 3.060702875399361, + "grad_norm": 0.0953374058008194, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3832 + }, + { + "epoch": 3.0615015974440896, + "grad_norm": 0.063878633081913, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3833 + }, + { + "epoch": 3.062300319488818, + "grad_norm": 0.17062409222126007, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3834 + }, + { + "epoch": 3.063099041533546, + "grad_norm": 0.23467828333377838, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3835 + }, + { + "epoch": 3.0638977635782747, + "grad_norm": 0.19458062946796417, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3836 + }, + { + "epoch": 3.0646964856230032, + "grad_norm": 0.06614453345537186, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3837 + }, + { + "epoch": 3.0654952076677318, + "grad_norm": 0.1250256896018982, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3838 + }, + { + "epoch": 3.06629392971246, + "grad_norm": 0.2399163395166397, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3839 + }, + { + "epoch": 3.0670926517571884, + "grad_norm": 0.22544947266578674, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3840 + }, + { + "epoch": 3.067891373801917, + "grad_norm": 0.0710826963186264, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3841 + }, + { + "epoch": 3.0686900958466454, + "grad_norm": 0.12259501218795776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3842 + }, + { + "epoch": 3.069488817891374, + "grad_norm": 0.1313357651233673, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3843 + }, + { + "epoch": 3.070287539936102, + "grad_norm": 0.05492740869522095, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3844 + }, + { + "epoch": 3.0710862619808306, + "grad_norm": 0.08860959857702255, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3845 + }, + { + "epoch": 3.071884984025559, + "grad_norm": 0.12556305527687073, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3846 + }, + { + "epoch": 3.0726837060702876, + "grad_norm": 0.10780923813581467, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3847 + }, + { + "epoch": 3.073482428115016, + "grad_norm": 0.0587402880191803, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3848 + }, + { + "epoch": 3.0742811501597442, + "grad_norm": 0.06155085563659668, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3849 + }, + { + "epoch": 3.0750798722044728, + "grad_norm": 0.07258733361959457, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3850 + }, + { + "epoch": 3.0758785942492013, + "grad_norm": 0.060939520597457886, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3851 + }, + { + "epoch": 3.07667731629393, + "grad_norm": 0.07125407457351685, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3852 + }, + { + "epoch": 3.0774760383386583, + "grad_norm": 0.15338753163814545, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3853 + }, + { + "epoch": 3.0782747603833864, + "grad_norm": 0.18328991532325745, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3854 + }, + { + "epoch": 3.079073482428115, + "grad_norm": 0.1338629275560379, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3855 + }, + { + "epoch": 3.0798722044728435, + "grad_norm": 0.042017024010419846, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3856 + }, + { + "epoch": 3.080670926517572, + "grad_norm": 0.13696196675300598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3857 + }, + { + "epoch": 3.0814696485623, + "grad_norm": 0.17552919685840607, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3858 + }, + { + "epoch": 3.0822683706070286, + "grad_norm": 0.09906235337257385, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3859 + }, + { + "epoch": 3.083067092651757, + "grad_norm": 0.057398926466703415, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3860 + }, + { + "epoch": 3.0838658146964857, + "grad_norm": 0.12260781973600388, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3861 + }, + { + "epoch": 3.084664536741214, + "grad_norm": 0.12672549486160278, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3862 + }, + { + "epoch": 3.0854632587859423, + "grad_norm": 0.07239031046628952, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3863 + }, + { + "epoch": 3.086261980830671, + "grad_norm": 0.0928259864449501, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3864 + }, + { + "epoch": 3.0870607028753994, + "grad_norm": 0.2161056250333786, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3865 + }, + { + "epoch": 3.087859424920128, + "grad_norm": 0.21302388608455658, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3866 + }, + { + "epoch": 3.0886581469648564, + "grad_norm": 0.10730110853910446, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3867 + }, + { + "epoch": 3.0894568690095845, + "grad_norm": 0.06801975518465042, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3868 + }, + { + "epoch": 3.090255591054313, + "grad_norm": 0.09036632627248764, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3869 + }, + { + "epoch": 3.0910543130990416, + "grad_norm": 0.1344052255153656, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3870 + }, + { + "epoch": 3.09185303514377, + "grad_norm": 0.10774482041597366, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3871 + }, + { + "epoch": 3.0926517571884986, + "grad_norm": 0.06824023276567459, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3872 + }, + { + "epoch": 3.0934504792332267, + "grad_norm": 0.11959507316350937, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3873 + }, + { + "epoch": 3.094249201277955, + "grad_norm": 0.14943768084049225, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3874 + }, + { + "epoch": 3.0950479233226837, + "grad_norm": 0.13593481481075287, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3875 + }, + { + "epoch": 3.0958466453674123, + "grad_norm": 0.06872473657131195, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3876 + }, + { + "epoch": 3.0966453674121404, + "grad_norm": 0.07243353873491287, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3877 + }, + { + "epoch": 3.097444089456869, + "grad_norm": 0.07884293049573898, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3878 + }, + { + "epoch": 3.0982428115015974, + "grad_norm": 0.09574474394321442, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3879 + }, + { + "epoch": 3.099041533546326, + "grad_norm": 0.09028270840644836, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3880 + }, + { + "epoch": 3.0998402555910545, + "grad_norm": 0.056680940091609955, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3881 + }, + { + "epoch": 3.1006389776357826, + "grad_norm": 0.13817615807056427, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3882 + }, + { + "epoch": 3.101437699680511, + "grad_norm": 0.16102705895900726, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3883 + }, + { + "epoch": 3.1022364217252396, + "grad_norm": 0.08887791633605957, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3884 + }, + { + "epoch": 3.103035143769968, + "grad_norm": 0.055100735276937485, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3885 + }, + { + "epoch": 3.1038338658146967, + "grad_norm": 0.10710839927196503, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3886 + }, + { + "epoch": 3.1046325878594248, + "grad_norm": 0.09228713810443878, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3887 + }, + { + "epoch": 3.1054313099041533, + "grad_norm": 0.04602783918380737, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3888 + }, + { + "epoch": 3.106230031948882, + "grad_norm": 0.03584764152765274, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3889 + }, + { + "epoch": 3.1070287539936103, + "grad_norm": 0.04486532881855965, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3890 + }, + { + "epoch": 3.107827476038339, + "grad_norm": 0.036488354206085205, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3891 + }, + { + "epoch": 3.108626198083067, + "grad_norm": 0.04213477671146393, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3892 + }, + { + "epoch": 3.1094249201277955, + "grad_norm": 0.03840509057044983, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 3893 + }, + { + "epoch": 3.110223642172524, + "grad_norm": 0.04800419509410858, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3894 + }, + { + "epoch": 3.1110223642172525, + "grad_norm": 0.06467507034540176, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3895 + }, + { + "epoch": 3.1118210862619806, + "grad_norm": 0.05736416578292847, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3896 + }, + { + "epoch": 3.112619808306709, + "grad_norm": 0.03337489813566208, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3897 + }, + { + "epoch": 3.1134185303514377, + "grad_norm": 0.088229238986969, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3898 + }, + { + "epoch": 3.114217252396166, + "grad_norm": 0.1492392122745514, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3899 + }, + { + "epoch": 3.1150159744408947, + "grad_norm": 0.1699269413948059, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3900 + }, + { + "epoch": 3.115814696485623, + "grad_norm": 0.11532948911190033, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3901 + }, + { + "epoch": 3.1166134185303513, + "grad_norm": 0.030054764822125435, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3902 + }, + { + "epoch": 3.11741214057508, + "grad_norm": 0.11079075932502747, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 3903 + }, + { + "epoch": 3.1182108626198084, + "grad_norm": 0.15733082592487335, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3904 + }, + { + "epoch": 3.119009584664537, + "grad_norm": 0.12520034611225128, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 3905 + }, + { + "epoch": 3.119808306709265, + "grad_norm": 0.03382280096411705, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3906 + }, + { + "epoch": 3.1206070287539935, + "grad_norm": 0.11951576173305511, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3907 + }, + { + "epoch": 3.121405750798722, + "grad_norm": 0.2123839259147644, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3908 + }, + { + "epoch": 3.1222044728434506, + "grad_norm": 0.15437674522399902, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3909 + }, + { + "epoch": 3.123003194888179, + "grad_norm": 0.06463608890771866, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3910 + }, + { + "epoch": 3.123801916932907, + "grad_norm": 0.10830746591091156, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3911 + }, + { + "epoch": 3.1246006389776357, + "grad_norm": 0.17621003091335297, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3912 + }, + { + "epoch": 3.1253993610223643, + "grad_norm": 0.12417379021644592, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3913 + }, + { + "epoch": 3.126198083067093, + "grad_norm": 0.05364898219704628, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3914 + }, + { + "epoch": 3.126996805111821, + "grad_norm": 0.17589502036571503, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3915 + }, + { + "epoch": 3.1277955271565494, + "grad_norm": 0.249656081199646, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3916 + }, + { + "epoch": 3.128594249201278, + "grad_norm": 0.1800973266363144, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3917 + }, + { + "epoch": 3.1293929712460065, + "grad_norm": 0.09763745218515396, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3918 + }, + { + "epoch": 3.130191693290735, + "grad_norm": 0.10953835397958755, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3919 + }, + { + "epoch": 3.130990415335463, + "grad_norm": 0.17490456998348236, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3920 + }, + { + "epoch": 3.1317891373801916, + "grad_norm": 0.11533153057098389, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3921 + }, + { + "epoch": 3.13258785942492, + "grad_norm": 0.07494231313467026, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3922 + }, + { + "epoch": 3.1333865814696487, + "grad_norm": 0.14954763650894165, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3923 + }, + { + "epoch": 3.134185303514377, + "grad_norm": 0.18061646819114685, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3924 + }, + { + "epoch": 3.1349840255591053, + "grad_norm": 0.10419650375843048, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3925 + }, + { + "epoch": 3.135782747603834, + "grad_norm": 0.04677566513419151, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3926 + }, + { + "epoch": 3.1365814696485623, + "grad_norm": 0.12846903502941132, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3927 + }, + { + "epoch": 3.137380191693291, + "grad_norm": 0.11824795603752136, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3928 + }, + { + "epoch": 3.1381789137380194, + "grad_norm": 0.04194530099630356, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3929 + }, + { + "epoch": 3.1389776357827475, + "grad_norm": 0.15154412388801575, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3930 + }, + { + "epoch": 3.139776357827476, + "grad_norm": 0.19073615968227386, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3931 + }, + { + "epoch": 3.1405750798722045, + "grad_norm": 0.12614648044109344, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3932 + }, + { + "epoch": 3.141373801916933, + "grad_norm": 0.03434520214796066, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3933 + }, + { + "epoch": 3.142172523961661, + "grad_norm": 0.11913489550352097, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3934 + }, + { + "epoch": 3.1429712460063897, + "grad_norm": 0.16297172009944916, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3935 + }, + { + "epoch": 3.143769968051118, + "grad_norm": 0.15605789422988892, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3936 + }, + { + "epoch": 3.1445686900958467, + "grad_norm": 0.10524406284093857, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3937 + }, + { + "epoch": 3.1453674121405752, + "grad_norm": 0.03763152286410332, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3938 + }, + { + "epoch": 3.1461661341853033, + "grad_norm": 0.07586465775966644, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3939 + }, + { + "epoch": 3.146964856230032, + "grad_norm": 0.14553581178188324, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3940 + }, + { + "epoch": 3.1477635782747604, + "grad_norm": 0.1883595883846283, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3941 + }, + { + "epoch": 3.148562300319489, + "grad_norm": 0.13018599152565002, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3942 + }, + { + "epoch": 3.1493610223642174, + "grad_norm": 0.05356704071164131, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3943 + }, + { + "epoch": 3.1501597444089455, + "grad_norm": 0.2083088606595993, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3944 + }, + { + "epoch": 3.150958466453674, + "grad_norm": 0.2586681544780731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3945 + }, + { + "epoch": 3.1517571884984026, + "grad_norm": 0.18733063340187073, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3946 + }, + { + "epoch": 3.152555910543131, + "grad_norm": 0.03741752356290817, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3947 + }, + { + "epoch": 3.1533546325878596, + "grad_norm": 0.11660216003656387, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3948 + }, + { + "epoch": 3.1541533546325877, + "grad_norm": 0.12698383629322052, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3949 + }, + { + "epoch": 3.1549520766773163, + "grad_norm": 0.10244922339916229, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3950 + }, + { + "epoch": 3.155750798722045, + "grad_norm": 0.03815237060189247, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3951 + }, + { + "epoch": 3.1565495207667733, + "grad_norm": 0.04394761845469475, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3952 + }, + { + "epoch": 3.1573482428115014, + "grad_norm": 0.1344541311264038, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3953 + }, + { + "epoch": 3.15814696485623, + "grad_norm": 0.23006947338581085, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3954 + }, + { + "epoch": 3.1589456869009584, + "grad_norm": 0.2667021155357361, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3955 + }, + { + "epoch": 3.159744408945687, + "grad_norm": 0.2410362809896469, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3956 + }, + { + "epoch": 3.1605431309904155, + "grad_norm": 0.1421661078929901, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3957 + }, + { + "epoch": 3.1613418530351436, + "grad_norm": 0.04178561642765999, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3958 + }, + { + "epoch": 3.162140575079872, + "grad_norm": 0.15327088534832, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3959 + }, + { + "epoch": 3.1629392971246006, + "grad_norm": 0.1372532993555069, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3960 + }, + { + "epoch": 3.163738019169329, + "grad_norm": 0.03763817250728607, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3961 + }, + { + "epoch": 3.1645367412140577, + "grad_norm": 0.13227587938308716, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3962 + }, + { + "epoch": 3.165335463258786, + "grad_norm": 0.1952073723077774, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3963 + }, + { + "epoch": 3.1661341853035143, + "grad_norm": 0.1672048568725586, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3964 + }, + { + "epoch": 3.166932907348243, + "grad_norm": 0.09593698382377625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3965 + }, + { + "epoch": 3.1677316293929714, + "grad_norm": 0.03619454428553581, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3966 + }, + { + "epoch": 3.1685303514377, + "grad_norm": 0.05974683538079262, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3967 + }, + { + "epoch": 3.169329073482428, + "grad_norm": 0.09733424335718155, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3968 + }, + { + "epoch": 3.1701277955271565, + "grad_norm": 0.07536087185144424, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3969 + }, + { + "epoch": 3.170926517571885, + "grad_norm": 0.04263869300484657, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3970 + }, + { + "epoch": 3.1717252396166136, + "grad_norm": 0.040521468967199326, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3971 + }, + { + "epoch": 3.1725239616613417, + "grad_norm": 0.05615096539258957, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3972 + }, + { + "epoch": 3.17332268370607, + "grad_norm": 0.06655194610357285, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3973 + }, + { + "epoch": 3.1741214057507987, + "grad_norm": 0.07300302386283875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3974 + }, + { + "epoch": 3.1749201277955272, + "grad_norm": 0.04789174720644951, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3975 + }, + { + "epoch": 3.1757188498402558, + "grad_norm": 0.03460157290101051, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3976 + }, + { + "epoch": 3.176517571884984, + "grad_norm": 0.0393557995557785, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3977 + }, + { + "epoch": 3.1773162939297124, + "grad_norm": 0.062453389167785645, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3978 + }, + { + "epoch": 3.178115015974441, + "grad_norm": 0.08542043715715408, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3979 + }, + { + "epoch": 3.1789137380191694, + "grad_norm": 0.08002828061580658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3980 + }, + { + "epoch": 3.179712460063898, + "grad_norm": 0.04635196551680565, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3981 + }, + { + "epoch": 3.180511182108626, + "grad_norm": 0.09583642333745956, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3982 + }, + { + "epoch": 3.1813099041533546, + "grad_norm": 0.12418454885482788, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3983 + }, + { + "epoch": 3.182108626198083, + "grad_norm": 0.10457618534564972, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3984 + }, + { + "epoch": 3.1829073482428116, + "grad_norm": 0.07183804363012314, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3985 + }, + { + "epoch": 3.18370607028754, + "grad_norm": 0.039956409484148026, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3986 + }, + { + "epoch": 3.1845047923322682, + "grad_norm": 0.0884016826748848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3987 + }, + { + "epoch": 3.1853035143769968, + "grad_norm": 0.112494558095932, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3988 + }, + { + "epoch": 3.1861022364217253, + "grad_norm": 0.07582054287195206, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3989 + }, + { + "epoch": 3.186900958466454, + "grad_norm": 0.060303278267383575, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3990 + }, + { + "epoch": 3.187699680511182, + "grad_norm": 0.048326775431632996, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3991 + }, + { + "epoch": 3.1884984025559104, + "grad_norm": 0.32322436571121216, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3992 + }, + { + "epoch": 3.189297124600639, + "grad_norm": 0.5569815039634705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3993 + }, + { + "epoch": 3.1900958466453675, + "grad_norm": 0.7590563893318176, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 3994 + }, + { + "epoch": 3.190894568690096, + "grad_norm": 0.6537879705429077, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3995 + }, + { + "epoch": 3.191693290734824, + "grad_norm": 0.16556645929813385, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3996 + }, + { + "epoch": 3.1924920127795526, + "grad_norm": 0.3745940625667572, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3997 + }, + { + "epoch": 3.193290734824281, + "grad_norm": 0.5159009695053101, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 3998 + }, + { + "epoch": 3.1940894568690097, + "grad_norm": 0.1302756816148758, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3999 + }, + { + "epoch": 3.194888178913738, + "grad_norm": 0.3484213054180145, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4000 + }, + { + "epoch": 3.1956869009584663, + "grad_norm": 0.23763029277324677, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4001 + }, + { + "epoch": 3.196485623003195, + "grad_norm": 0.20648746192455292, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4002 + }, + { + "epoch": 3.1972843450479234, + "grad_norm": 0.31230399012565613, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4003 + }, + { + "epoch": 3.198083067092652, + "grad_norm": 0.15389247238636017, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4004 + }, + { + "epoch": 3.1988817891373804, + "grad_norm": 0.6544334292411804, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4005 + }, + { + "epoch": 3.1996805111821085, + "grad_norm": 0.5409669280052185, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4006 + }, + { + "epoch": 3.200479233226837, + "grad_norm": 0.11126074194908142, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4007 + }, + { + "epoch": 3.2012779552715656, + "grad_norm": 0.3257724642753601, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4008 + }, + { + "epoch": 3.202076677316294, + "grad_norm": 0.4188903272151947, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4009 + }, + { + "epoch": 3.202875399361022, + "grad_norm": 0.1012830138206482, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4010 + }, + { + "epoch": 3.2036741214057507, + "grad_norm": 0.2771216034889221, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4011 + }, + { + "epoch": 3.2044728434504792, + "grad_norm": 0.2873278260231018, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4012 + }, + { + "epoch": 3.2052715654952078, + "grad_norm": 0.09620041400194168, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4013 + }, + { + "epoch": 3.2060702875399363, + "grad_norm": 0.10561787337064743, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4014 + }, + { + "epoch": 3.2068690095846644, + "grad_norm": 0.12499046325683594, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4015 + }, + { + "epoch": 3.207667731629393, + "grad_norm": 0.4055064916610718, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4016 + }, + { + "epoch": 3.2084664536741214, + "grad_norm": 0.9722099900245667, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 4017 + }, + { + "epoch": 3.20926517571885, + "grad_norm": 0.7367122173309326, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 4018 + }, + { + "epoch": 3.2100638977635785, + "grad_norm": 0.4455755650997162, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4019 + }, + { + "epoch": 3.2108626198083066, + "grad_norm": 0.10350961983203888, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4020 + }, + { + "epoch": 3.211661341853035, + "grad_norm": 0.41901662945747375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4021 + }, + { + "epoch": 3.2124600638977636, + "grad_norm": 0.5987749695777893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4022 + }, + { + "epoch": 3.213258785942492, + "grad_norm": 1.5967272520065308, + "learning_rate": 0.0005, + "loss": 1.1938, + "step": 4023 + }, + { + "epoch": 3.2140575079872207, + "grad_norm": 3.289113759994507, + "learning_rate": 0.0005, + "loss": 1.2474, + "step": 4024 + }, + { + "epoch": 3.2148562300319488, + "grad_norm": 0.40220701694488525, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 4025 + }, + { + "epoch": 3.2156549520766773, + "grad_norm": 0.15129008889198303, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4026 + }, + { + "epoch": 3.216453674121406, + "grad_norm": 19.060272216796875, + "learning_rate": 0.0005, + "loss": 1.4668, + "step": 4027 + }, + { + "epoch": 3.2172523961661343, + "grad_norm": 1.72987961769104, + "learning_rate": 0.0005, + "loss": 1.3675, + "step": 4028 + }, + { + "epoch": 3.2180511182108624, + "grad_norm": 2.1064836978912354, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 4029 + }, + { + "epoch": 3.218849840255591, + "grad_norm": 1.0206952095031738, + "learning_rate": 0.0005, + "loss": 1.2602, + "step": 4030 + }, + { + "epoch": 3.2196485623003195, + "grad_norm": 14.109564781188965, + "learning_rate": 0.0005, + "loss": 3.9831, + "step": 4031 + }, + { + "epoch": 3.220447284345048, + "grad_norm": 12.518637657165527, + "learning_rate": 0.0005, + "loss": 3.4388, + "step": 4032 + }, + { + "epoch": 3.2212460063897765, + "grad_norm": 4.156238079071045, + "learning_rate": 0.0005, + "loss": 2.1713, + "step": 4033 + }, + { + "epoch": 3.2220447284345046, + "grad_norm": 2.752128839492798, + "learning_rate": 0.0005, + "loss": 1.6581, + "step": 4034 + }, + { + "epoch": 3.222843450479233, + "grad_norm": 5.876696586608887, + "learning_rate": 0.0005, + "loss": 2.1698, + "step": 4035 + }, + { + "epoch": 3.2236421725239617, + "grad_norm": 7.60305118560791, + "learning_rate": 0.0005, + "loss": 3.0713, + "step": 4036 + }, + { + "epoch": 3.22444089456869, + "grad_norm": 2.581448554992676, + "learning_rate": 0.0005, + "loss": 1.7677, + "step": 4037 + }, + { + "epoch": 3.2252396166134187, + "grad_norm": 1.0544116497039795, + "learning_rate": 0.0005, + "loss": 1.4604, + "step": 4038 + }, + { + "epoch": 3.226038338658147, + "grad_norm": 10.742961883544922, + "learning_rate": 0.0005, + "loss": 3.8634, + "step": 4039 + }, + { + "epoch": 3.2268370607028753, + "grad_norm": 6.555435657501221, + "learning_rate": 0.0005, + "loss": 2.7229, + "step": 4040 + }, + { + "epoch": 3.227635782747604, + "grad_norm": 4.335379600524902, + "learning_rate": 0.0005, + "loss": 2.548, + "step": 4041 + }, + { + "epoch": 3.2284345047923324, + "grad_norm": 3.9863200187683105, + "learning_rate": 0.0005, + "loss": 2.5051, + "step": 4042 + }, + { + "epoch": 3.229233226837061, + "grad_norm": 3.4922895431518555, + "learning_rate": 0.0005, + "loss": 2.1996, + "step": 4043 + }, + { + "epoch": 3.230031948881789, + "grad_norm": 0.9404768347740173, + "learning_rate": 0.0005, + "loss": 1.7869, + "step": 4044 + }, + { + "epoch": 3.2308306709265175, + "grad_norm": 1.2953938245773315, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 4045 + }, + { + "epoch": 3.231629392971246, + "grad_norm": 2.0215165615081787, + "learning_rate": 0.0005, + "loss": 1.9429, + "step": 4046 + }, + { + "epoch": 3.2324281150159746, + "grad_norm": 1.2744032144546509, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 4047 + }, + { + "epoch": 3.2332268370607027, + "grad_norm": 2.042656660079956, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 4048 + }, + { + "epoch": 3.234025559105431, + "grad_norm": 6.607172012329102, + "learning_rate": 0.0005, + "loss": 2.8381, + "step": 4049 + }, + { + "epoch": 3.2348242811501597, + "grad_norm": 1.2499932050704956, + "learning_rate": 0.0005, + "loss": 1.6324, + "step": 4050 + }, + { + "epoch": 3.2356230031948883, + "grad_norm": 1.1896424293518066, + "learning_rate": 0.0005, + "loss": 1.7201, + "step": 4051 + }, + { + "epoch": 3.236421725239617, + "grad_norm": 1.9901418685913086, + "learning_rate": 0.0005, + "loss": 1.7335, + "step": 4052 + }, + { + "epoch": 3.237220447284345, + "grad_norm": 0.8886330127716064, + "learning_rate": 0.0005, + "loss": 1.5111, + "step": 4053 + }, + { + "epoch": 3.2380191693290734, + "grad_norm": 2.6570353507995605, + "learning_rate": 0.0005, + "loss": 1.8628, + "step": 4054 + }, + { + "epoch": 3.238817891373802, + "grad_norm": 2.212905168533325, + "learning_rate": 0.0005, + "loss": 1.5838, + "step": 4055 + }, + { + "epoch": 3.2396166134185305, + "grad_norm": 3.1234660148620605, + "learning_rate": 0.0005, + "loss": 1.7212, + "step": 4056 + }, + { + "epoch": 3.2404153354632586, + "grad_norm": 0.9168338775634766, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 4057 + }, + { + "epoch": 3.241214057507987, + "grad_norm": 0.8366042971611023, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 4058 + }, + { + "epoch": 3.2420127795527156, + "grad_norm": 0.5359059572219849, + "learning_rate": 0.0005, + "loss": 1.4185, + "step": 4059 + }, + { + "epoch": 3.242811501597444, + "grad_norm": 1.8511804342269897, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 4060 + }, + { + "epoch": 3.2436102236421727, + "grad_norm": 1.3229485750198364, + "learning_rate": 0.0005, + "loss": 1.4497, + "step": 4061 + }, + { + "epoch": 3.244408945686901, + "grad_norm": 0.8846393823623657, + "learning_rate": 0.0005, + "loss": 1.384, + "step": 4062 + }, + { + "epoch": 3.2452076677316293, + "grad_norm": 1.1345176696777344, + "learning_rate": 0.0005, + "loss": 1.3906, + "step": 4063 + }, + { + "epoch": 3.246006389776358, + "grad_norm": 0.998261034488678, + "learning_rate": 0.0005, + "loss": 1.3807, + "step": 4064 + }, + { + "epoch": 3.2468051118210863, + "grad_norm": 0.8998358249664307, + "learning_rate": 0.0005, + "loss": 1.3321, + "step": 4065 + }, + { + "epoch": 3.247603833865815, + "grad_norm": 0.6892838478088379, + "learning_rate": 0.0005, + "loss": 1.3718, + "step": 4066 + }, + { + "epoch": 3.248402555910543, + "grad_norm": 0.515389084815979, + "learning_rate": 0.0005, + "loss": 1.3296, + "step": 4067 + }, + { + "epoch": 3.2492012779552715, + "grad_norm": 0.41038376092910767, + "learning_rate": 0.0005, + "loss": 1.2855, + "step": 4068 + }, + { + "epoch": 3.25, + "grad_norm": 0.6094494462013245, + "learning_rate": 0.0005, + "loss": 1.2953, + "step": 4069 + }, + { + "epoch": 3.2507987220447285, + "grad_norm": 0.6274027228355408, + "learning_rate": 0.0005, + "loss": 1.2879, + "step": 4070 + }, + { + "epoch": 3.251597444089457, + "grad_norm": 0.8833006024360657, + "learning_rate": 0.0005, + "loss": 1.2806, + "step": 4071 + }, + { + "epoch": 3.252396166134185, + "grad_norm": 0.8688742518424988, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 4072 + }, + { + "epoch": 3.2531948881789137, + "grad_norm": 0.34751075506210327, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 4073 + }, + { + "epoch": 3.253993610223642, + "grad_norm": 0.4245823621749878, + "learning_rate": 0.0005, + "loss": 1.2529, + "step": 4074 + }, + { + "epoch": 3.2547923322683707, + "grad_norm": 0.4495961368083954, + "learning_rate": 0.0005, + "loss": 1.2438, + "step": 4075 + }, + { + "epoch": 3.255591054313099, + "grad_norm": 0.683125913143158, + "learning_rate": 0.0005, + "loss": 1.2297, + "step": 4076 + }, + { + "epoch": 3.2563897763578273, + "grad_norm": 0.4342438876628876, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 4077 + }, + { + "epoch": 3.257188498402556, + "grad_norm": 0.2018793523311615, + "learning_rate": 0.0005, + "loss": 1.2313, + "step": 4078 + }, + { + "epoch": 3.2579872204472844, + "grad_norm": 0.26145434379577637, + "learning_rate": 0.0005, + "loss": 1.218, + "step": 4079 + }, + { + "epoch": 3.258785942492013, + "grad_norm": 0.16941657662391663, + "learning_rate": 0.0005, + "loss": 1.2211, + "step": 4080 + }, + { + "epoch": 3.2595846645367414, + "grad_norm": 0.3158339262008667, + "learning_rate": 0.0005, + "loss": 1.2192, + "step": 4081 + }, + { + "epoch": 3.2603833865814695, + "grad_norm": 0.18630816042423248, + "learning_rate": 0.0005, + "loss": 1.2091, + "step": 4082 + }, + { + "epoch": 3.261182108626198, + "grad_norm": 0.19504855573177338, + "learning_rate": 0.0005, + "loss": 1.2047, + "step": 4083 + }, + { + "epoch": 3.2619808306709266, + "grad_norm": 0.19672146439552307, + "learning_rate": 0.0005, + "loss": 1.2022, + "step": 4084 + }, + { + "epoch": 3.262779552715655, + "grad_norm": 0.15959087014198303, + "learning_rate": 0.0005, + "loss": 1.1957, + "step": 4085 + }, + { + "epoch": 3.263578274760383, + "grad_norm": 0.18326745927333832, + "learning_rate": 0.0005, + "loss": 1.1835, + "step": 4086 + }, + { + "epoch": 3.2643769968051117, + "grad_norm": 0.23495830595493317, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 4087 + }, + { + "epoch": 3.2651757188498403, + "grad_norm": 0.22718247771263123, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 4088 + }, + { + "epoch": 3.265974440894569, + "grad_norm": 0.2913427948951721, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 4089 + }, + { + "epoch": 3.2667731629392973, + "grad_norm": 0.44531312584877014, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 4090 + }, + { + "epoch": 3.2675718849840254, + "grad_norm": 0.6265004277229309, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 4091 + }, + { + "epoch": 3.268370607028754, + "grad_norm": 0.6119574904441833, + "learning_rate": 0.0005, + "loss": 1.186, + "step": 4092 + }, + { + "epoch": 3.2691693290734825, + "grad_norm": 0.23989497125148773, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 4093 + }, + { + "epoch": 3.269968051118211, + "grad_norm": 0.266013503074646, + "learning_rate": 0.0005, + "loss": 1.1693, + "step": 4094 + }, + { + "epoch": 3.270766773162939, + "grad_norm": 0.2205667793750763, + "learning_rate": 0.0005, + "loss": 1.1627, + "step": 4095 + }, + { + "epoch": 3.2715654952076676, + "grad_norm": 0.4600715935230255, + "learning_rate": 0.0005, + "loss": 1.1566, + "step": 4096 + }, + { + "epoch": 3.272364217252396, + "grad_norm": 0.6725661754608154, + "learning_rate": 0.0005, + "loss": 1.1806, + "step": 4097 + }, + { + "epoch": 3.2731629392971247, + "grad_norm": 0.3836606442928314, + "learning_rate": 0.0005, + "loss": 1.1613, + "step": 4098 + }, + { + "epoch": 3.273961661341853, + "grad_norm": 0.3752588629722595, + "learning_rate": 0.0005, + "loss": 1.1639, + "step": 4099 + }, + { + "epoch": 3.2747603833865817, + "grad_norm": 0.3297381103038788, + "learning_rate": 0.0005, + "loss": 1.1488, + "step": 4100 + }, + { + "epoch": 3.27555910543131, + "grad_norm": 0.5899438858032227, + "learning_rate": 0.0005, + "loss": 1.1486, + "step": 4101 + }, + { + "epoch": 3.2763578274760383, + "grad_norm": 0.5899466872215271, + "learning_rate": 0.0005, + "loss": 1.1533, + "step": 4102 + }, + { + "epoch": 3.277156549520767, + "grad_norm": 0.2944958209991455, + "learning_rate": 0.0005, + "loss": 1.1517, + "step": 4103 + }, + { + "epoch": 3.2779552715654954, + "grad_norm": 0.5870373249053955, + "learning_rate": 0.0005, + "loss": 1.1484, + "step": 4104 + }, + { + "epoch": 3.2787539936102235, + "grad_norm": 0.25267326831817627, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 4105 + }, + { + "epoch": 3.279552715654952, + "grad_norm": 0.20602582395076752, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 4106 + }, + { + "epoch": 3.2803514376996805, + "grad_norm": 0.4151447117328644, + "learning_rate": 0.0005, + "loss": 1.1338, + "step": 4107 + }, + { + "epoch": 3.281150159744409, + "grad_norm": 0.6591519117355347, + "learning_rate": 0.0005, + "loss": 1.1395, + "step": 4108 + }, + { + "epoch": 3.2819488817891376, + "grad_norm": 0.48510807752609253, + "learning_rate": 0.0005, + "loss": 1.1496, + "step": 4109 + }, + { + "epoch": 3.2827476038338657, + "grad_norm": 0.27803128957748413, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 4110 + }, + { + "epoch": 3.283546325878594, + "grad_norm": 0.3939184546470642, + "learning_rate": 0.0005, + "loss": 1.141, + "step": 4111 + }, + { + "epoch": 3.2843450479233227, + "grad_norm": 0.18271984159946442, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 4112 + }, + { + "epoch": 3.2851437699680512, + "grad_norm": 0.19690747559070587, + "learning_rate": 0.0005, + "loss": 1.1286, + "step": 4113 + }, + { + "epoch": 3.2859424920127793, + "grad_norm": 0.22968755662441254, + "learning_rate": 0.0005, + "loss": 1.1316, + "step": 4114 + }, + { + "epoch": 3.286741214057508, + "grad_norm": 0.24908174574375153, + "learning_rate": 0.0005, + "loss": 1.1279, + "step": 4115 + }, + { + "epoch": 3.2875399361022364, + "grad_norm": 0.15813285112380981, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 4116 + }, + { + "epoch": 3.288338658146965, + "grad_norm": 0.1056000292301178, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 4117 + }, + { + "epoch": 3.2891373801916934, + "grad_norm": 0.19983351230621338, + "learning_rate": 0.0005, + "loss": 1.118, + "step": 4118 + }, + { + "epoch": 3.289936102236422, + "grad_norm": 0.13660027086734772, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 4119 + }, + { + "epoch": 3.29073482428115, + "grad_norm": 0.15008457005023956, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 4120 + }, + { + "epoch": 3.2915335463258786, + "grad_norm": 0.1475287824869156, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 4121 + }, + { + "epoch": 3.292332268370607, + "grad_norm": 0.10478811711072922, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 4122 + }, + { + "epoch": 3.2931309904153356, + "grad_norm": 0.1577034890651703, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 4123 + }, + { + "epoch": 3.2939297124600637, + "grad_norm": 0.1019970178604126, + "learning_rate": 0.0005, + "loss": 1.1117, + "step": 4124 + }, + { + "epoch": 3.2947284345047922, + "grad_norm": 0.09229713678359985, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 4125 + }, + { + "epoch": 3.2955271565495208, + "grad_norm": 0.10029986500740051, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 4126 + }, + { + "epoch": 3.2963258785942493, + "grad_norm": 0.14171569049358368, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4127 + }, + { + "epoch": 3.297124600638978, + "grad_norm": 0.17343609035015106, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 4128 + }, + { + "epoch": 3.297923322683706, + "grad_norm": 0.2738705277442932, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 4129 + }, + { + "epoch": 3.2987220447284344, + "grad_norm": 0.3518083691596985, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 4130 + }, + { + "epoch": 3.299520766773163, + "grad_norm": 0.16174353659152985, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 4131 + }, + { + "epoch": 3.3003194888178915, + "grad_norm": 0.24402645230293274, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4132 + }, + { + "epoch": 3.3011182108626196, + "grad_norm": 0.23362669348716736, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 4133 + }, + { + "epoch": 3.301916932907348, + "grad_norm": 0.1391523778438568, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 4134 + }, + { + "epoch": 3.3027156549520766, + "grad_norm": 0.1516295224428177, + "learning_rate": 0.0005, + "loss": 1.0968, + "step": 4135 + }, + { + "epoch": 3.303514376996805, + "grad_norm": 0.17463526129722595, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 4136 + }, + { + "epoch": 3.3043130990415337, + "grad_norm": 0.13717398047447205, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 4137 + }, + { + "epoch": 3.3051118210862622, + "grad_norm": 0.16802728176116943, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 4138 + }, + { + "epoch": 3.3059105431309903, + "grad_norm": 0.11959057301282883, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 4139 + }, + { + "epoch": 3.306709265175719, + "grad_norm": 0.07706355303525925, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4140 + }, + { + "epoch": 3.3075079872204474, + "grad_norm": 0.07729125767946243, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 4141 + }, + { + "epoch": 3.308306709265176, + "grad_norm": 0.08654871582984924, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4142 + }, + { + "epoch": 3.309105431309904, + "grad_norm": 0.11485479772090912, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 4143 + }, + { + "epoch": 3.3099041533546325, + "grad_norm": 0.10812658816576004, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4144 + }, + { + "epoch": 3.310702875399361, + "grad_norm": 0.08537860214710236, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4145 + }, + { + "epoch": 3.3115015974440896, + "grad_norm": 0.10628878325223923, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 4146 + }, + { + "epoch": 3.312300319488818, + "grad_norm": 0.14903275668621063, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 4147 + }, + { + "epoch": 3.313099041533546, + "grad_norm": 0.09670894593000412, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4148 + }, + { + "epoch": 3.3138977635782747, + "grad_norm": 0.10959025472402573, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 4149 + }, + { + "epoch": 3.3146964856230032, + "grad_norm": 0.10397703945636749, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 4150 + }, + { + "epoch": 3.3154952076677318, + "grad_norm": 0.07681623846292496, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 4151 + }, + { + "epoch": 3.31629392971246, + "grad_norm": 0.07938152551651001, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 4152 + }, + { + "epoch": 3.3170926517571884, + "grad_norm": 0.14678052067756653, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 4153 + }, + { + "epoch": 3.317891373801917, + "grad_norm": 0.15366105735301971, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4154 + }, + { + "epoch": 3.3186900958466454, + "grad_norm": 0.13449597358703613, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 4155 + }, + { + "epoch": 3.319488817891374, + "grad_norm": 0.0861068144440651, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 4156 + }, + { + "epoch": 3.3202875399361025, + "grad_norm": 0.0604286752641201, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 4157 + }, + { + "epoch": 3.3210862619808306, + "grad_norm": 0.08299542963504791, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4158 + }, + { + "epoch": 3.321884984025559, + "grad_norm": 0.0738200917840004, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 4159 + }, + { + "epoch": 3.3226837060702876, + "grad_norm": 0.06450676172971725, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4160 + }, + { + "epoch": 3.323482428115016, + "grad_norm": 0.055281370878219604, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4161 + }, + { + "epoch": 3.3242811501597442, + "grad_norm": 0.09895910322666168, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4162 + }, + { + "epoch": 3.3250798722044728, + "grad_norm": 0.10338333994150162, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 4163 + }, + { + "epoch": 3.3258785942492013, + "grad_norm": 0.08346354216337204, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 4164 + }, + { + "epoch": 3.32667731629393, + "grad_norm": 0.15257857739925385, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 4165 + }, + { + "epoch": 3.3274760383386583, + "grad_norm": 0.1782383918762207, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4166 + }, + { + "epoch": 3.3282747603833864, + "grad_norm": 0.09908363968133926, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4167 + }, + { + "epoch": 3.329073482428115, + "grad_norm": 0.18942143023014069, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 4168 + }, + { + "epoch": 3.3298722044728435, + "grad_norm": 0.21095149219036102, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4169 + }, + { + "epoch": 3.330670926517572, + "grad_norm": 0.11597894132137299, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4170 + }, + { + "epoch": 3.3314696485623, + "grad_norm": 0.20450811088085175, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 4171 + }, + { + "epoch": 3.3322683706070286, + "grad_norm": 0.1609300971031189, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4172 + }, + { + "epoch": 3.333067092651757, + "grad_norm": 0.14068877696990967, + "learning_rate": 0.0005, + "loss": 1.0835, + "step": 4173 + }, + { + "epoch": 3.3338658146964857, + "grad_norm": 0.11969266831874847, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 4174 + }, + { + "epoch": 3.334664536741214, + "grad_norm": 0.16986626386642456, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4175 + }, + { + "epoch": 3.3354632587859427, + "grad_norm": 0.2065591812133789, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4176 + }, + { + "epoch": 3.336261980830671, + "grad_norm": 0.23542748391628265, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 4177 + }, + { + "epoch": 3.3370607028753994, + "grad_norm": 0.20896919071674347, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 4178 + }, + { + "epoch": 3.337859424920128, + "grad_norm": 0.16446076333522797, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 4179 + }, + { + "epoch": 3.3386581469648564, + "grad_norm": 0.11143177002668381, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 4180 + }, + { + "epoch": 3.3394568690095845, + "grad_norm": 0.0866970345377922, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 4181 + }, + { + "epoch": 3.340255591054313, + "grad_norm": 0.14608244597911835, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 4182 + }, + { + "epoch": 3.3410543130990416, + "grad_norm": 0.06152384728193283, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4183 + }, + { + "epoch": 3.34185303514377, + "grad_norm": 0.14289656281471252, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4184 + }, + { + "epoch": 3.3426517571884986, + "grad_norm": 0.16735558211803436, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4185 + }, + { + "epoch": 3.3434504792332267, + "grad_norm": 0.09012678265571594, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 4186 + }, + { + "epoch": 3.344249201277955, + "grad_norm": 0.05861378088593483, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4187 + }, + { + "epoch": 3.3450479233226837, + "grad_norm": 0.07123090326786041, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4188 + }, + { + "epoch": 3.3458466453674123, + "grad_norm": 0.07879375666379929, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4189 + }, + { + "epoch": 3.3466453674121404, + "grad_norm": 0.0925324484705925, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 4190 + }, + { + "epoch": 3.347444089456869, + "grad_norm": 0.0686444416642189, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4191 + }, + { + "epoch": 3.3482428115015974, + "grad_norm": 0.08633724600076675, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4192 + }, + { + "epoch": 3.349041533546326, + "grad_norm": 0.056881021708250046, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4193 + }, + { + "epoch": 3.3498402555910545, + "grad_norm": 0.07752947509288788, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4194 + }, + { + "epoch": 3.3506389776357826, + "grad_norm": 0.0927717313170433, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 4195 + }, + { + "epoch": 3.351437699680511, + "grad_norm": 0.09599179029464722, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 4196 + }, + { + "epoch": 3.3522364217252396, + "grad_norm": 0.09090889245271683, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 4197 + }, + { + "epoch": 3.353035143769968, + "grad_norm": 0.12757429480552673, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 4198 + }, + { + "epoch": 3.3538338658146967, + "grad_norm": 0.15210460126399994, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4199 + }, + { + "epoch": 3.3546325878594248, + "grad_norm": 0.10982836782932281, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4200 + }, + { + "epoch": 3.3554313099041533, + "grad_norm": 0.056641776114702225, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4201 + }, + { + "epoch": 3.356230031948882, + "grad_norm": 0.09506776928901672, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4202 + }, + { + "epoch": 3.3570287539936103, + "grad_norm": 0.12064918130636215, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4203 + }, + { + "epoch": 3.357827476038339, + "grad_norm": 0.12343298643827438, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 4204 + }, + { + "epoch": 3.358626198083067, + "grad_norm": 0.11508476734161377, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4205 + }, + { + "epoch": 3.3594249201277955, + "grad_norm": 0.07552453875541687, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4206 + }, + { + "epoch": 3.360223642172524, + "grad_norm": 0.10495936870574951, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 4207 + }, + { + "epoch": 3.3610223642172525, + "grad_norm": 0.13230633735656738, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4208 + }, + { + "epoch": 3.3618210862619806, + "grad_norm": 0.13003787398338318, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4209 + }, + { + "epoch": 3.362619808306709, + "grad_norm": 0.09252234548330307, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 4210 + }, + { + "epoch": 3.3634185303514377, + "grad_norm": 0.07739317417144775, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 4211 + }, + { + "epoch": 3.364217252396166, + "grad_norm": 0.12185318768024445, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4212 + }, + { + "epoch": 3.3650159744408947, + "grad_norm": 0.17643119394779205, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4213 + }, + { + "epoch": 3.365814696485623, + "grad_norm": 0.10462872684001923, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4214 + }, + { + "epoch": 3.3666134185303513, + "grad_norm": 0.1486569344997406, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 4215 + }, + { + "epoch": 3.36741214057508, + "grad_norm": 0.11858930438756943, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4216 + }, + { + "epoch": 3.3682108626198084, + "grad_norm": 0.07907772809267044, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4217 + }, + { + "epoch": 3.369009584664537, + "grad_norm": 0.5416387319564819, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 4218 + }, + { + "epoch": 3.369808306709265, + "grad_norm": 0.08767322450876236, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4219 + }, + { + "epoch": 3.3706070287539935, + "grad_norm": 0.09651107341051102, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4220 + }, + { + "epoch": 3.371405750798722, + "grad_norm": 0.07548791915178299, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4221 + }, + { + "epoch": 3.3722044728434506, + "grad_norm": 0.09317605197429657, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4222 + }, + { + "epoch": 3.373003194888179, + "grad_norm": 0.07431582361459732, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4223 + }, + { + "epoch": 3.373801916932907, + "grad_norm": 0.12754018604755402, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4224 + }, + { + "epoch": 3.3746006389776357, + "grad_norm": 0.12697845697402954, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4225 + }, + { + "epoch": 3.3753993610223643, + "grad_norm": 0.21522995829582214, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4226 + }, + { + "epoch": 3.376198083067093, + "grad_norm": 0.08886270225048065, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 4227 + }, + { + "epoch": 3.376996805111821, + "grad_norm": 0.07107655704021454, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4228 + }, + { + "epoch": 3.3777955271565494, + "grad_norm": 0.07452798634767532, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 4229 + }, + { + "epoch": 3.378594249201278, + "grad_norm": 0.10205573588609695, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4230 + }, + { + "epoch": 3.3793929712460065, + "grad_norm": 0.10990341752767563, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4231 + }, + { + "epoch": 3.380191693290735, + "grad_norm": 0.08567643165588379, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4232 + }, + { + "epoch": 3.380990415335463, + "grad_norm": 0.057073548436164856, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4233 + }, + { + "epoch": 3.3817891373801916, + "grad_norm": 0.12602978944778442, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4234 + }, + { + "epoch": 3.38258785942492, + "grad_norm": 0.1715400218963623, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4235 + }, + { + "epoch": 3.3833865814696487, + "grad_norm": 0.13129903376102448, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 4236 + }, + { + "epoch": 3.384185303514377, + "grad_norm": 0.1308225691318512, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4237 + }, + { + "epoch": 3.3849840255591053, + "grad_norm": 0.1353990137577057, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4238 + }, + { + "epoch": 3.385782747603834, + "grad_norm": 0.08648121356964111, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4239 + }, + { + "epoch": 3.3865814696485623, + "grad_norm": 0.23568236827850342, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4240 + }, + { + "epoch": 3.387380191693291, + "grad_norm": 0.20514735579490662, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4241 + }, + { + "epoch": 3.3881789137380194, + "grad_norm": 0.10276424884796143, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4242 + }, + { + "epoch": 3.3889776357827475, + "grad_norm": 0.1838751584291458, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4243 + }, + { + "epoch": 3.389776357827476, + "grad_norm": 0.1697031557559967, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4244 + }, + { + "epoch": 3.3905750798722045, + "grad_norm": 0.11439084261655807, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4245 + }, + { + "epoch": 3.391373801916933, + "grad_norm": 0.14021249115467072, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4246 + }, + { + "epoch": 3.392172523961661, + "grad_norm": 0.13989558815956116, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4247 + }, + { + "epoch": 3.3929712460063897, + "grad_norm": 0.12039095908403397, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4248 + }, + { + "epoch": 3.393769968051118, + "grad_norm": 0.17901045083999634, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4249 + }, + { + "epoch": 3.3945686900958467, + "grad_norm": 0.1053776666522026, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 4250 + }, + { + "epoch": 3.3953674121405752, + "grad_norm": 1.7777512073516846, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4251 + }, + { + "epoch": 3.3961661341853033, + "grad_norm": 0.06677904725074768, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 4252 + }, + { + "epoch": 3.396964856230032, + "grad_norm": 0.16123540699481964, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4253 + }, + { + "epoch": 3.3977635782747604, + "grad_norm": 0.21530884504318237, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4254 + }, + { + "epoch": 3.398562300319489, + "grad_norm": 0.20979386568069458, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4255 + }, + { + "epoch": 3.3993610223642174, + "grad_norm": 0.14755229651927948, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 4256 + }, + { + "epoch": 3.4001597444089455, + "grad_norm": 0.10182930529117584, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4257 + }, + { + "epoch": 3.400958466453674, + "grad_norm": 0.11478064954280853, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4258 + }, + { + "epoch": 3.4017571884984026, + "grad_norm": 0.2052452266216278, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4259 + }, + { + "epoch": 3.402555910543131, + "grad_norm": 0.6292023062705994, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4260 + }, + { + "epoch": 3.4033546325878596, + "grad_norm": 0.0666726678609848, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4261 + }, + { + "epoch": 3.4041533546325877, + "grad_norm": 0.11848346143960953, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4262 + }, + { + "epoch": 3.4049520766773163, + "grad_norm": 0.15276756882667542, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4263 + }, + { + "epoch": 3.405750798722045, + "grad_norm": 0.08534786105155945, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4264 + }, + { + "epoch": 3.4065495207667733, + "grad_norm": 0.07453266531229019, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4265 + }, + { + "epoch": 3.4073482428115014, + "grad_norm": 0.12894752621650696, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4266 + }, + { + "epoch": 3.40814696485623, + "grad_norm": 0.11341612786054611, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4267 + }, + { + "epoch": 3.4089456869009584, + "grad_norm": 0.06551265716552734, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4268 + }, + { + "epoch": 3.409744408945687, + "grad_norm": 0.08828622102737427, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4269 + }, + { + "epoch": 3.4105431309904155, + "grad_norm": 0.06951884925365448, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4270 + }, + { + "epoch": 3.4113418530351436, + "grad_norm": 0.0785432904958725, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4271 + }, + { + "epoch": 3.412140575079872, + "grad_norm": 0.06681766360998154, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4272 + }, + { + "epoch": 3.4129392971246006, + "grad_norm": 0.060111526399850845, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4273 + }, + { + "epoch": 3.413738019169329, + "grad_norm": 0.07451382279396057, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 4274 + }, + { + "epoch": 3.4145367412140573, + "grad_norm": 0.08646225184202194, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4275 + }, + { + "epoch": 3.415335463258786, + "grad_norm": 0.07061789929866791, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4276 + }, + { + "epoch": 3.4161341853035143, + "grad_norm": 0.09554821997880936, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4277 + }, + { + "epoch": 3.416932907348243, + "grad_norm": 0.11288002133369446, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4278 + }, + { + "epoch": 3.4177316293929714, + "grad_norm": 0.10565607994794846, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4279 + }, + { + "epoch": 3.4185303514377, + "grad_norm": 0.08235503733158112, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4280 + }, + { + "epoch": 3.419329073482428, + "grad_norm": 0.1302265226840973, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4281 + }, + { + "epoch": 3.4201277955271565, + "grad_norm": 0.07910848408937454, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4282 + }, + { + "epoch": 3.420926517571885, + "grad_norm": 0.10624215006828308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4283 + }, + { + "epoch": 3.4217252396166136, + "grad_norm": 0.08545158058404922, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4284 + }, + { + "epoch": 3.4225239616613417, + "grad_norm": 0.07010428607463837, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4285 + }, + { + "epoch": 3.42332268370607, + "grad_norm": 0.08256867527961731, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4286 + }, + { + "epoch": 3.4241214057507987, + "grad_norm": 0.13074247539043427, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4287 + }, + { + "epoch": 3.4249201277955272, + "grad_norm": 0.18332679569721222, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4288 + }, + { + "epoch": 3.4257188498402558, + "grad_norm": 0.1671689748764038, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4289 + }, + { + "epoch": 3.426517571884984, + "grad_norm": 0.10386296361684799, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4290 + }, + { + "epoch": 3.4273162939297124, + "grad_norm": 0.07554108649492264, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 4291 + }, + { + "epoch": 3.428115015974441, + "grad_norm": 0.1138196587562561, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4292 + }, + { + "epoch": 3.4289137380191694, + "grad_norm": 0.1681462526321411, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4293 + }, + { + "epoch": 3.4297124600638975, + "grad_norm": 0.1833198368549347, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4294 + }, + { + "epoch": 3.430511182108626, + "grad_norm": 0.10269228368997574, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4295 + }, + { + "epoch": 3.4313099041533546, + "grad_norm": 0.08876223117113113, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4296 + }, + { + "epoch": 3.432108626198083, + "grad_norm": 0.21489253640174866, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4297 + }, + { + "epoch": 3.4329073482428116, + "grad_norm": 0.22669701278209686, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4298 + }, + { + "epoch": 3.43370607028754, + "grad_norm": 0.16946858167648315, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4299 + }, + { + "epoch": 3.4345047923322682, + "grad_norm": 0.05162649229168892, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4300 + }, + { + "epoch": 3.4353035143769968, + "grad_norm": 0.09700657427310944, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4301 + }, + { + "epoch": 3.4361022364217253, + "grad_norm": 0.14858263731002808, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4302 + }, + { + "epoch": 3.436900958466454, + "grad_norm": 0.16938818991184235, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4303 + }, + { + "epoch": 3.437699680511182, + "grad_norm": 0.13441702723503113, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4304 + }, + { + "epoch": 3.4384984025559104, + "grad_norm": 0.07661818712949753, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4305 + }, + { + "epoch": 3.439297124600639, + "grad_norm": 0.19436489045619965, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4306 + }, + { + "epoch": 3.4400958466453675, + "grad_norm": 0.20447906851768494, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4307 + }, + { + "epoch": 3.440894568690096, + "grad_norm": 0.1414622664451599, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4308 + }, + { + "epoch": 3.441693290734824, + "grad_norm": 0.06289447098970413, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4309 + }, + { + "epoch": 3.4424920127795526, + "grad_norm": 0.0966482162475586, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4310 + }, + { + "epoch": 3.443290734824281, + "grad_norm": 0.1300116777420044, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4311 + }, + { + "epoch": 3.4440894568690097, + "grad_norm": 0.11638098210096359, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4312 + }, + { + "epoch": 3.4448881789137378, + "grad_norm": 0.08284632116556168, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4313 + }, + { + "epoch": 3.4456869009584663, + "grad_norm": 0.0617060512304306, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4314 + }, + { + "epoch": 3.446485623003195, + "grad_norm": 0.12798283994197845, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4315 + }, + { + "epoch": 3.4472843450479234, + "grad_norm": 0.12712593376636505, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4316 + }, + { + "epoch": 3.448083067092652, + "grad_norm": 0.09164460003376007, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4317 + }, + { + "epoch": 3.4488817891373804, + "grad_norm": 0.07618964463472366, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4318 + }, + { + "epoch": 3.4496805111821085, + "grad_norm": 0.07986288517713547, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4319 + }, + { + "epoch": 3.450479233226837, + "grad_norm": 0.0783228650689125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4320 + }, + { + "epoch": 3.4512779552715656, + "grad_norm": 0.09899114072322845, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4321 + }, + { + "epoch": 3.452076677316294, + "grad_norm": 0.13710227608680725, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4322 + }, + { + "epoch": 3.452875399361022, + "grad_norm": 0.1281789392232895, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4323 + }, + { + "epoch": 3.4536741214057507, + "grad_norm": 0.11021110415458679, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4324 + }, + { + "epoch": 3.4544728434504792, + "grad_norm": 0.11450989544391632, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4325 + }, + { + "epoch": 3.4552715654952078, + "grad_norm": 0.09010434150695801, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4326 + }, + { + "epoch": 3.4560702875399363, + "grad_norm": 0.08817321807146072, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4327 + }, + { + "epoch": 3.4568690095846644, + "grad_norm": 0.06502921879291534, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4328 + }, + { + "epoch": 3.457667731629393, + "grad_norm": 0.13399769365787506, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4329 + }, + { + "epoch": 3.4584664536741214, + "grad_norm": 0.19785602390766144, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4330 + }, + { + "epoch": 3.45926517571885, + "grad_norm": 0.15761834383010864, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4331 + }, + { + "epoch": 3.460063897763578, + "grad_norm": 0.11824636161327362, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4332 + }, + { + "epoch": 3.4608626198083066, + "grad_norm": 0.07031631469726562, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4333 + }, + { + "epoch": 3.461661341853035, + "grad_norm": 0.09940601140260696, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4334 + }, + { + "epoch": 3.4624600638977636, + "grad_norm": 0.11931589990854263, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4335 + }, + { + "epoch": 3.463258785942492, + "grad_norm": 0.0967932790517807, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4336 + }, + { + "epoch": 3.4640575079872207, + "grad_norm": 0.09523937106132507, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4337 + }, + { + "epoch": 3.4648562300319488, + "grad_norm": 0.09964902698993683, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4338 + }, + { + "epoch": 3.4656549520766773, + "grad_norm": 0.09898022562265396, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4339 + }, + { + "epoch": 3.466453674121406, + "grad_norm": 0.05388521030545235, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4340 + }, + { + "epoch": 3.4672523961661343, + "grad_norm": 0.06455415487289429, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4341 + }, + { + "epoch": 3.4680511182108624, + "grad_norm": 0.05497310310602188, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4342 + }, + { + "epoch": 3.468849840255591, + "grad_norm": 0.049679841846227646, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4343 + }, + { + "epoch": 3.4696485623003195, + "grad_norm": 0.05664939060807228, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4344 + }, + { + "epoch": 3.470447284345048, + "grad_norm": 0.06651245057582855, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4345 + }, + { + "epoch": 3.4712460063897765, + "grad_norm": 0.08480475097894669, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4346 + }, + { + "epoch": 3.4720447284345046, + "grad_norm": 0.07331875711679459, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4347 + }, + { + "epoch": 3.472843450479233, + "grad_norm": 0.0505477711558342, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4348 + }, + { + "epoch": 3.4736421725239617, + "grad_norm": 0.06969176232814789, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4349 + }, + { + "epoch": 3.47444089456869, + "grad_norm": 0.08915391564369202, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4350 + }, + { + "epoch": 3.4752396166134183, + "grad_norm": 0.09378752112388611, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4351 + }, + { + "epoch": 3.476038338658147, + "grad_norm": 0.059195373207330704, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4352 + }, + { + "epoch": 3.4768370607028753, + "grad_norm": 0.07094884663820267, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4353 + }, + { + "epoch": 3.477635782747604, + "grad_norm": 0.11091995984315872, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4354 + }, + { + "epoch": 3.4784345047923324, + "grad_norm": 0.14018885791301727, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4355 + }, + { + "epoch": 3.479233226837061, + "grad_norm": 0.13553708791732788, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4356 + }, + { + "epoch": 3.480031948881789, + "grad_norm": 0.08005240559577942, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4357 + }, + { + "epoch": 3.4808306709265175, + "grad_norm": 0.05309261009097099, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4358 + }, + { + "epoch": 3.481629392971246, + "grad_norm": 0.09956394135951996, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4359 + }, + { + "epoch": 3.4824281150159746, + "grad_norm": 0.13189470767974854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4360 + }, + { + "epoch": 3.4832268370607027, + "grad_norm": 0.13651393353939056, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4361 + }, + { + "epoch": 3.484025559105431, + "grad_norm": 0.12467528879642487, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4362 + }, + { + "epoch": 3.4848242811501597, + "grad_norm": 0.11428561061620712, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4363 + }, + { + "epoch": 3.4856230031948883, + "grad_norm": 0.12095288187265396, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4364 + }, + { + "epoch": 3.486421725239617, + "grad_norm": 0.05889631807804108, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4365 + }, + { + "epoch": 3.487220447284345, + "grad_norm": 0.1158040463924408, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4366 + }, + { + "epoch": 3.4880191693290734, + "grad_norm": 0.11070148646831512, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4367 + }, + { + "epoch": 3.488817891373802, + "grad_norm": 0.0625298023223877, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4368 + }, + { + "epoch": 3.4896166134185305, + "grad_norm": 0.11865562945604324, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4369 + }, + { + "epoch": 3.4904153354632586, + "grad_norm": 0.12237154692411423, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4370 + }, + { + "epoch": 3.491214057507987, + "grad_norm": 0.05703050270676613, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4371 + }, + { + "epoch": 3.4920127795527156, + "grad_norm": 0.17314022779464722, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4372 + }, + { + "epoch": 3.492811501597444, + "grad_norm": 0.2984711825847626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4373 + }, + { + "epoch": 3.4936102236421727, + "grad_norm": 0.30129608511924744, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4374 + }, + { + "epoch": 3.494408945686901, + "grad_norm": 0.12154170870780945, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4375 + }, + { + "epoch": 3.4952076677316293, + "grad_norm": 0.12467148154973984, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4376 + }, + { + "epoch": 3.496006389776358, + "grad_norm": 0.23285721242427826, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4377 + }, + { + "epoch": 3.4968051118210863, + "grad_norm": 0.20723310112953186, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4378 + }, + { + "epoch": 3.497603833865815, + "grad_norm": 0.13221028447151184, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4379 + }, + { + "epoch": 3.498402555910543, + "grad_norm": 0.06008061394095421, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4380 + }, + { + "epoch": 3.4992012779552715, + "grad_norm": 0.12877988815307617, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4381 + }, + { + "epoch": 3.5, + "grad_norm": 0.1951032429933548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4382 + }, + { + "epoch": 3.5007987220447285, + "grad_norm": 0.13804258406162262, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4383 + }, + { + "epoch": 3.501597444089457, + "grad_norm": 0.06761720031499863, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4384 + }, + { + "epoch": 3.502396166134185, + "grad_norm": 0.13217084109783173, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4385 + }, + { + "epoch": 3.5031948881789137, + "grad_norm": 0.11773377656936646, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4386 + }, + { + "epoch": 3.503993610223642, + "grad_norm": 0.07580399513244629, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4387 + }, + { + "epoch": 3.5047923322683707, + "grad_norm": 0.1739586442708969, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4388 + }, + { + "epoch": 3.505591054313099, + "grad_norm": 0.14863203465938568, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4389 + }, + { + "epoch": 3.5063897763578273, + "grad_norm": 0.07858511805534363, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4390 + }, + { + "epoch": 3.507188498402556, + "grad_norm": 0.15966418385505676, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4391 + }, + { + "epoch": 3.5079872204472844, + "grad_norm": 0.28761810064315796, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4392 + }, + { + "epoch": 3.508785942492013, + "grad_norm": 0.24169668555259705, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4393 + }, + { + "epoch": 3.5095846645367414, + "grad_norm": 0.07907059788703918, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4394 + }, + { + "epoch": 3.5103833865814695, + "grad_norm": 0.20243291556835175, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4395 + }, + { + "epoch": 3.511182108626198, + "grad_norm": 0.302198588848114, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4396 + }, + { + "epoch": 3.5119808306709266, + "grad_norm": 0.2544843554496765, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4397 + }, + { + "epoch": 3.512779552715655, + "grad_norm": 0.07381684333086014, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4398 + }, + { + "epoch": 3.513578274760383, + "grad_norm": 0.17388348281383514, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4399 + }, + { + "epoch": 3.5143769968051117, + "grad_norm": 0.2293306440114975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4400 + }, + { + "epoch": 3.5151757188498403, + "grad_norm": 0.07548263669013977, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4401 + }, + { + "epoch": 3.515974440894569, + "grad_norm": 0.1924273669719696, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4402 + }, + { + "epoch": 3.5167731629392973, + "grad_norm": 0.26867300271987915, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4403 + }, + { + "epoch": 3.5175718849840254, + "grad_norm": 0.14461541175842285, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4404 + }, + { + "epoch": 3.518370607028754, + "grad_norm": 0.12608370184898376, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4405 + }, + { + "epoch": 3.5191693290734825, + "grad_norm": 0.20579756796360016, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4406 + }, + { + "epoch": 3.519968051118211, + "grad_norm": 0.12286399304866791, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4407 + }, + { + "epoch": 3.520766773162939, + "grad_norm": 0.055247388780117035, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4408 + }, + { + "epoch": 3.5215654952076676, + "grad_norm": 0.07877562195062637, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4409 + }, + { + "epoch": 3.522364217252396, + "grad_norm": 0.0769568607211113, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4410 + }, + { + "epoch": 3.5231629392971247, + "grad_norm": 0.0898609384894371, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4411 + }, + { + "epoch": 3.523961661341853, + "grad_norm": 0.057637594640254974, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4412 + }, + { + "epoch": 3.5247603833865817, + "grad_norm": 0.12046241015195847, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4413 + }, + { + "epoch": 3.52555910543131, + "grad_norm": 0.09949496388435364, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4414 + }, + { + "epoch": 3.5263578274760383, + "grad_norm": 0.054411277174949646, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4415 + }, + { + "epoch": 3.527156549520767, + "grad_norm": 0.08293551951646805, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4416 + }, + { + "epoch": 3.527955271565495, + "grad_norm": 0.07669435441493988, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4417 + }, + { + "epoch": 3.5287539936102235, + "grad_norm": 0.06382326781749725, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4418 + }, + { + "epoch": 3.529552715654952, + "grad_norm": 0.07673322409391403, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4419 + }, + { + "epoch": 3.5303514376996805, + "grad_norm": 0.08052650839090347, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4420 + }, + { + "epoch": 3.531150159744409, + "grad_norm": 0.1354246884584427, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4421 + }, + { + "epoch": 3.5319488817891376, + "grad_norm": 0.07951574772596359, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 4422 + }, + { + "epoch": 3.5327476038338657, + "grad_norm": 0.11002526432275772, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4423 + }, + { + "epoch": 3.533546325878594, + "grad_norm": 0.18597234785556793, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4424 + }, + { + "epoch": 3.5343450479233227, + "grad_norm": 0.12601099908351898, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4425 + }, + { + "epoch": 3.5351437699680512, + "grad_norm": 0.11181886494159698, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4426 + }, + { + "epoch": 3.5359424920127793, + "grad_norm": 0.11489108949899673, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4427 + }, + { + "epoch": 3.536741214057508, + "grad_norm": 0.10422708839178085, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4428 + }, + { + "epoch": 3.5375399361022364, + "grad_norm": 0.13701972365379333, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4429 + }, + { + "epoch": 3.538338658146965, + "grad_norm": 0.10713281482458115, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4430 + }, + { + "epoch": 3.5391373801916934, + "grad_norm": 0.11508526653051376, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4431 + }, + { + "epoch": 3.539936102236422, + "grad_norm": 0.061856236308813095, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4432 + }, + { + "epoch": 3.54073482428115, + "grad_norm": 0.12080623209476471, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4433 + }, + { + "epoch": 3.5415335463258786, + "grad_norm": 0.12233573198318481, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4434 + }, + { + "epoch": 3.542332268370607, + "grad_norm": 0.07041362673044205, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4435 + }, + { + "epoch": 3.543130990415335, + "grad_norm": 0.1162526085972786, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4436 + }, + { + "epoch": 3.5439297124600637, + "grad_norm": 0.12962234020233154, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4437 + }, + { + "epoch": 3.5447284345047922, + "grad_norm": 0.1368536353111267, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4438 + }, + { + "epoch": 3.5455271565495208, + "grad_norm": 0.061806995421648026, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4439 + }, + { + "epoch": 3.5463258785942493, + "grad_norm": 0.11016163975000381, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4440 + }, + { + "epoch": 3.547124600638978, + "grad_norm": 0.0992715135216713, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4441 + }, + { + "epoch": 3.547923322683706, + "grad_norm": 0.14015190303325653, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4442 + }, + { + "epoch": 3.5487220447284344, + "grad_norm": 0.07255455106496811, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4443 + }, + { + "epoch": 3.549520766773163, + "grad_norm": 0.13293872773647308, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4444 + }, + { + "epoch": 3.5503194888178915, + "grad_norm": 0.08923539519309998, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4445 + }, + { + "epoch": 3.5511182108626196, + "grad_norm": 0.10125918686389923, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4446 + }, + { + "epoch": 3.551916932907348, + "grad_norm": 0.12369748950004578, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4447 + }, + { + "epoch": 3.5527156549520766, + "grad_norm": 0.14656996726989746, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4448 + }, + { + "epoch": 3.553514376996805, + "grad_norm": 0.14212539792060852, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4449 + }, + { + "epoch": 3.5543130990415337, + "grad_norm": 0.08640166372060776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4450 + }, + { + "epoch": 3.5551118210862622, + "grad_norm": 0.05552735924720764, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4451 + }, + { + "epoch": 3.5559105431309903, + "grad_norm": 0.12888140976428986, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4452 + }, + { + "epoch": 3.556709265175719, + "grad_norm": 0.10696940869092941, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4453 + }, + { + "epoch": 3.5575079872204474, + "grad_norm": 0.06578963249921799, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4454 + }, + { + "epoch": 3.5583067092651754, + "grad_norm": 0.16173291206359863, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4455 + }, + { + "epoch": 3.559105431309904, + "grad_norm": 0.1550486832857132, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4456 + }, + { + "epoch": 3.5599041533546325, + "grad_norm": 0.14084209501743317, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4457 + }, + { + "epoch": 3.560702875399361, + "grad_norm": 0.12024512141942978, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4458 + }, + { + "epoch": 3.5615015974440896, + "grad_norm": 0.12514936923980713, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4459 + }, + { + "epoch": 3.562300319488818, + "grad_norm": 0.16444219648838043, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4460 + }, + { + "epoch": 3.563099041533546, + "grad_norm": 0.11520830541849136, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4461 + }, + { + "epoch": 3.5638977635782747, + "grad_norm": 0.07884586602449417, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4462 + }, + { + "epoch": 3.5646964856230032, + "grad_norm": 0.1655684858560562, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4463 + }, + { + "epoch": 3.5654952076677318, + "grad_norm": 0.15222500264644623, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4464 + }, + { + "epoch": 3.56629392971246, + "grad_norm": 0.06106618419289589, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4465 + }, + { + "epoch": 3.5670926517571884, + "grad_norm": 0.10545333474874496, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4466 + }, + { + "epoch": 3.567891373801917, + "grad_norm": 0.1353088915348053, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4467 + }, + { + "epoch": 3.5686900958466454, + "grad_norm": 0.11200091242790222, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4468 + }, + { + "epoch": 3.569488817891374, + "grad_norm": 0.052965741604566574, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4469 + }, + { + "epoch": 3.5702875399361025, + "grad_norm": 0.1244843453168869, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4470 + }, + { + "epoch": 3.5710862619808306, + "grad_norm": 0.1160016730427742, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4471 + }, + { + "epoch": 3.571884984025559, + "grad_norm": 0.04874402657151222, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4472 + }, + { + "epoch": 3.5726837060702876, + "grad_norm": 0.14222301542758942, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4473 + }, + { + "epoch": 3.5734824281150157, + "grad_norm": 0.1190859004855156, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4474 + }, + { + "epoch": 3.5742811501597442, + "grad_norm": 0.0659632682800293, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4475 + }, + { + "epoch": 3.5750798722044728, + "grad_norm": 0.07350483536720276, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4476 + }, + { + "epoch": 3.5758785942492013, + "grad_norm": 0.1220504492521286, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4477 + }, + { + "epoch": 3.57667731629393, + "grad_norm": 0.08952966332435608, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4478 + }, + { + "epoch": 3.5774760383386583, + "grad_norm": 0.08828000724315643, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4479 + }, + { + "epoch": 3.5782747603833864, + "grad_norm": 0.14621564745903015, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4480 + }, + { + "epoch": 3.579073482428115, + "grad_norm": 0.13653770089149475, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4481 + }, + { + "epoch": 3.5798722044728435, + "grad_norm": 0.0682564228773117, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4482 + }, + { + "epoch": 3.580670926517572, + "grad_norm": 0.06511309742927551, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4483 + }, + { + "epoch": 3.5814696485623, + "grad_norm": 0.08800239861011505, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4484 + }, + { + "epoch": 3.5822683706070286, + "grad_norm": 0.06488335877656937, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4485 + }, + { + "epoch": 3.583067092651757, + "grad_norm": 0.06505738198757172, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4486 + }, + { + "epoch": 3.5838658146964857, + "grad_norm": 0.07395542412996292, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4487 + }, + { + "epoch": 3.584664536741214, + "grad_norm": 0.06717971712350845, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4488 + }, + { + "epoch": 3.5854632587859427, + "grad_norm": 0.056708067655563354, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4489 + }, + { + "epoch": 3.586261980830671, + "grad_norm": 0.06316737830638885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4490 + }, + { + "epoch": 3.5870607028753994, + "grad_norm": 0.06079665198922157, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4491 + }, + { + "epoch": 3.587859424920128, + "grad_norm": 0.1293981820344925, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4492 + }, + { + "epoch": 3.588658146964856, + "grad_norm": 0.08021418750286102, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4493 + }, + { + "epoch": 3.5894568690095845, + "grad_norm": 0.096865214407444, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4494 + }, + { + "epoch": 3.590255591054313, + "grad_norm": 0.06794966757297516, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4495 + }, + { + "epoch": 3.5910543130990416, + "grad_norm": 0.04527222737669945, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4496 + }, + { + "epoch": 3.59185303514377, + "grad_norm": 0.07153941690921783, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4497 + }, + { + "epoch": 3.5926517571884986, + "grad_norm": 0.07480445504188538, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4498 + }, + { + "epoch": 3.5934504792332267, + "grad_norm": 0.09161835163831711, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4499 + }, + { + "epoch": 3.594249201277955, + "grad_norm": 0.08420681953430176, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4500 + }, + { + "epoch": 3.5950479233226837, + "grad_norm": 0.04745415225625038, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4501 + }, + { + "epoch": 3.5958466453674123, + "grad_norm": 0.061325494199991226, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4502 + }, + { + "epoch": 3.5966453674121404, + "grad_norm": 0.08550430834293365, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4503 + }, + { + "epoch": 3.597444089456869, + "grad_norm": 0.09530419111251831, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4504 + }, + { + "epoch": 3.5982428115015974, + "grad_norm": 0.10484769195318222, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4505 + }, + { + "epoch": 3.599041533546326, + "grad_norm": 0.08398665487766266, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4506 + }, + { + "epoch": 3.5998402555910545, + "grad_norm": 0.1644149124622345, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4507 + }, + { + "epoch": 3.600638977635783, + "grad_norm": 0.0803244560956955, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4508 + }, + { + "epoch": 3.601437699680511, + "grad_norm": 0.12512895464897156, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4509 + }, + { + "epoch": 3.6022364217252396, + "grad_norm": 0.1404576301574707, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4510 + }, + { + "epoch": 3.603035143769968, + "grad_norm": 0.10823316127061844, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4511 + }, + { + "epoch": 3.6038338658146962, + "grad_norm": 0.06985688954591751, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4512 + }, + { + "epoch": 3.6046325878594248, + "grad_norm": 0.1651264876127243, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4513 + }, + { + "epoch": 3.6054313099041533, + "grad_norm": 0.19752484560012817, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4514 + }, + { + "epoch": 3.606230031948882, + "grad_norm": 0.20005464553833008, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4515 + }, + { + "epoch": 3.6070287539936103, + "grad_norm": 0.1478145569562912, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4516 + }, + { + "epoch": 3.607827476038339, + "grad_norm": 0.05737901106476784, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4517 + }, + { + "epoch": 3.608626198083067, + "grad_norm": 0.16174650192260742, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4518 + }, + { + "epoch": 3.6094249201277955, + "grad_norm": 0.1959141194820404, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4519 + }, + { + "epoch": 3.610223642172524, + "grad_norm": 0.09767267853021622, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4520 + }, + { + "epoch": 3.6110223642172525, + "grad_norm": 0.10553760081529617, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4521 + }, + { + "epoch": 3.6118210862619806, + "grad_norm": 0.19380977749824524, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4522 + }, + { + "epoch": 3.612619808306709, + "grad_norm": 0.2024526745080948, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4523 + }, + { + "epoch": 3.6134185303514377, + "grad_norm": 0.09705837070941925, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4524 + }, + { + "epoch": 3.614217252396166, + "grad_norm": 0.12530986964702606, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4525 + }, + { + "epoch": 3.6150159744408947, + "grad_norm": 0.20901283621788025, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4526 + }, + { + "epoch": 3.6158146964856233, + "grad_norm": 0.16532309353351593, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4527 + }, + { + "epoch": 3.6166134185303513, + "grad_norm": 0.18353991210460663, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4528 + }, + { + "epoch": 3.61741214057508, + "grad_norm": 0.12912365794181824, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4529 + }, + { + "epoch": 3.6182108626198084, + "grad_norm": 0.2052653580904007, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4530 + }, + { + "epoch": 3.6190095846645365, + "grad_norm": 0.1395503133535385, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4531 + }, + { + "epoch": 3.619808306709265, + "grad_norm": 0.07939961552619934, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4532 + }, + { + "epoch": 3.6206070287539935, + "grad_norm": 0.10098318755626678, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4533 + }, + { + "epoch": 3.621405750798722, + "grad_norm": 0.14332561194896698, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4534 + }, + { + "epoch": 3.6222044728434506, + "grad_norm": 0.09697199612855911, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4535 + }, + { + "epoch": 3.623003194888179, + "grad_norm": 0.07785658538341522, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4536 + }, + { + "epoch": 3.623801916932907, + "grad_norm": 0.11263108998537064, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4537 + }, + { + "epoch": 3.6246006389776357, + "grad_norm": 0.18257030844688416, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4538 + }, + { + "epoch": 3.6253993610223643, + "grad_norm": 0.1456373631954193, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4539 + }, + { + "epoch": 3.626198083067093, + "grad_norm": 0.06831679493188858, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4540 + }, + { + "epoch": 3.626996805111821, + "grad_norm": 0.12324535846710205, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4541 + }, + { + "epoch": 3.6277955271565494, + "grad_norm": 0.15868282318115234, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4542 + }, + { + "epoch": 3.628594249201278, + "grad_norm": 0.09355167299509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4543 + }, + { + "epoch": 3.6293929712460065, + "grad_norm": 0.08047328144311905, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4544 + }, + { + "epoch": 3.630191693290735, + "grad_norm": 0.12683328986167908, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4545 + }, + { + "epoch": 3.6309904153354635, + "grad_norm": 0.11964920908212662, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4546 + }, + { + "epoch": 3.6317891373801916, + "grad_norm": 0.0504109226167202, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4547 + }, + { + "epoch": 3.63258785942492, + "grad_norm": 0.11909852921962738, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4548 + }, + { + "epoch": 3.6333865814696487, + "grad_norm": 0.16763992607593536, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4549 + }, + { + "epoch": 3.6341853035143767, + "grad_norm": 0.1486649513244629, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4550 + }, + { + "epoch": 3.6349840255591053, + "grad_norm": 0.06941305845975876, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4551 + }, + { + "epoch": 3.635782747603834, + "grad_norm": 0.1177566722035408, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4552 + }, + { + "epoch": 3.6365814696485623, + "grad_norm": 0.23368601500988007, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4553 + }, + { + "epoch": 3.637380191693291, + "grad_norm": 0.24657249450683594, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4554 + }, + { + "epoch": 3.6381789137380194, + "grad_norm": 0.10063605010509491, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4555 + }, + { + "epoch": 3.6389776357827475, + "grad_norm": 0.1553603708744049, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4556 + }, + { + "epoch": 3.639776357827476, + "grad_norm": 0.25588107109069824, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4557 + }, + { + "epoch": 3.6405750798722045, + "grad_norm": 0.15270236134529114, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4558 + }, + { + "epoch": 3.641373801916933, + "grad_norm": 0.108666330575943, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4559 + }, + { + "epoch": 3.642172523961661, + "grad_norm": 0.19828133285045624, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4560 + }, + { + "epoch": 3.6429712460063897, + "grad_norm": 0.21500051021575928, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4561 + }, + { + "epoch": 3.643769968051118, + "grad_norm": 0.16299934685230255, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4562 + }, + { + "epoch": 3.6445686900958467, + "grad_norm": 0.07390763610601425, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4563 + }, + { + "epoch": 3.6453674121405752, + "grad_norm": 0.22709119319915771, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4564 + }, + { + "epoch": 3.6461661341853038, + "grad_norm": 0.15557943284511566, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4565 + }, + { + "epoch": 3.646964856230032, + "grad_norm": 0.062457580119371414, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4566 + }, + { + "epoch": 3.6477635782747604, + "grad_norm": 0.09101095795631409, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4567 + }, + { + "epoch": 3.648562300319489, + "grad_norm": 0.08700825273990631, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 4568 + }, + { + "epoch": 3.649361022364217, + "grad_norm": 0.058703795075416565, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4569 + }, + { + "epoch": 3.6501597444089455, + "grad_norm": 0.056776538491249084, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4570 + }, + { + "epoch": 3.650958466453674, + "grad_norm": 0.062245409935712814, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4571 + }, + { + "epoch": 3.6517571884984026, + "grad_norm": 0.0534074492752552, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 4572 + }, + { + "epoch": 3.652555910543131, + "grad_norm": 0.09061384946107864, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4573 + }, + { + "epoch": 3.6533546325878596, + "grad_norm": 0.07323598116636276, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4574 + }, + { + "epoch": 3.6541533546325877, + "grad_norm": 0.1120329350233078, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 4575 + }, + { + "epoch": 3.6549520766773163, + "grad_norm": 0.07965485006570816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4576 + }, + { + "epoch": 3.655750798722045, + "grad_norm": 0.06320462375879288, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4577 + }, + { + "epoch": 3.6565495207667733, + "grad_norm": 0.07869421690702438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4578 + }, + { + "epoch": 3.6573482428115014, + "grad_norm": 0.09003151208162308, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4579 + }, + { + "epoch": 3.65814696485623, + "grad_norm": 0.05570388212800026, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4580 + }, + { + "epoch": 3.6589456869009584, + "grad_norm": 0.15563733875751495, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4581 + }, + { + "epoch": 3.659744408945687, + "grad_norm": 0.1422414481639862, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4582 + }, + { + "epoch": 3.6605431309904155, + "grad_norm": 0.13704177737236023, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4583 + }, + { + "epoch": 3.661341853035144, + "grad_norm": 0.36126458644866943, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4584 + }, + { + "epoch": 3.662140575079872, + "grad_norm": 0.09024632722139359, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4585 + }, + { + "epoch": 3.6629392971246006, + "grad_norm": 0.07135412096977234, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4586 + }, + { + "epoch": 3.663738019169329, + "grad_norm": 0.06172417849302292, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4587 + }, + { + "epoch": 3.6645367412140573, + "grad_norm": 0.05962595343589783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4588 + }, + { + "epoch": 3.665335463258786, + "grad_norm": 0.07063078880310059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4589 + }, + { + "epoch": 3.6661341853035143, + "grad_norm": 0.1445596069097519, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4590 + }, + { + "epoch": 3.666932907348243, + "grad_norm": 0.09224060922861099, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4591 + }, + { + "epoch": 3.6677316293929714, + "grad_norm": 0.10353037714958191, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4592 + }, + { + "epoch": 3.6685303514377, + "grad_norm": 0.10922796279191971, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4593 + }, + { + "epoch": 3.669329073482428, + "grad_norm": 0.08728764951229095, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4594 + }, + { + "epoch": 3.6701277955271565, + "grad_norm": 0.0639081671833992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4595 + }, + { + "epoch": 3.670926517571885, + "grad_norm": 0.050491299480199814, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4596 + }, + { + "epoch": 3.6717252396166136, + "grad_norm": 0.07127548009157181, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4597 + }, + { + "epoch": 3.6725239616613417, + "grad_norm": 0.05432606860995293, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4598 + }, + { + "epoch": 3.67332268370607, + "grad_norm": 0.0653342455625534, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4599 + }, + { + "epoch": 3.6741214057507987, + "grad_norm": 0.08766797184944153, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4600 + }, + { + "epoch": 3.6749201277955272, + "grad_norm": 0.0816602036356926, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4601 + }, + { + "epoch": 3.6757188498402558, + "grad_norm": 0.08774783462285995, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4602 + }, + { + "epoch": 3.6765175718849843, + "grad_norm": 0.07776570320129395, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4603 + }, + { + "epoch": 3.6773162939297124, + "grad_norm": 0.07067213952541351, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4604 + }, + { + "epoch": 3.678115015974441, + "grad_norm": 0.06581863760948181, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4605 + }, + { + "epoch": 3.6789137380191694, + "grad_norm": 0.08631278574466705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4606 + }, + { + "epoch": 3.6797124600638975, + "grad_norm": 0.10875384509563446, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4607 + }, + { + "epoch": 3.680511182108626, + "grad_norm": 0.11207764595746994, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4608 + }, + { + "epoch": 3.6813099041533546, + "grad_norm": 0.08943730592727661, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4609 + }, + { + "epoch": 3.682108626198083, + "grad_norm": 0.1922001987695694, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4610 + }, + { + "epoch": 3.6829073482428116, + "grad_norm": 0.10121189057826996, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4611 + }, + { + "epoch": 3.68370607028754, + "grad_norm": 0.05991055443882942, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4612 + }, + { + "epoch": 3.6845047923322682, + "grad_norm": 0.0897853821516037, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4613 + }, + { + "epoch": 3.6853035143769968, + "grad_norm": 0.13160353899002075, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4614 + }, + { + "epoch": 3.6861022364217253, + "grad_norm": 0.13855913281440735, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4615 + }, + { + "epoch": 3.686900958466454, + "grad_norm": 0.11086787283420563, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4616 + }, + { + "epoch": 3.687699680511182, + "grad_norm": 0.07992085069417953, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4617 + }, + { + "epoch": 3.6884984025559104, + "grad_norm": 0.11618958413600922, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4618 + }, + { + "epoch": 3.689297124600639, + "grad_norm": 0.19551296532154083, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4619 + }, + { + "epoch": 3.6900958466453675, + "grad_norm": 0.20239807665348053, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4620 + }, + { + "epoch": 3.690894568690096, + "grad_norm": 0.13233833014965057, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4621 + }, + { + "epoch": 3.6916932907348246, + "grad_norm": 0.08789848536252975, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4622 + }, + { + "epoch": 3.6924920127795526, + "grad_norm": 0.2363075315952301, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4623 + }, + { + "epoch": 3.693290734824281, + "grad_norm": 0.2585245668888092, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4624 + }, + { + "epoch": 3.6940894568690097, + "grad_norm": 0.15822109580039978, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4625 + }, + { + "epoch": 3.6948881789137378, + "grad_norm": 0.07197296619415283, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4626 + }, + { + "epoch": 3.6956869009584663, + "grad_norm": 0.21067900955677032, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4627 + }, + { + "epoch": 3.696485623003195, + "grad_norm": 0.19520802795886993, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4628 + }, + { + "epoch": 3.6972843450479234, + "grad_norm": 0.08310793340206146, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4629 + }, + { + "epoch": 3.698083067092652, + "grad_norm": 0.2118932604789734, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4630 + }, + { + "epoch": 3.6988817891373804, + "grad_norm": 0.2236505001783371, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4631 + }, + { + "epoch": 3.6996805111821085, + "grad_norm": 0.16256077587604523, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4632 + }, + { + "epoch": 3.700479233226837, + "grad_norm": 0.14406970143318176, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4633 + }, + { + "epoch": 3.7012779552715656, + "grad_norm": 0.09738676995038986, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4634 + }, + { + "epoch": 3.702076677316294, + "grad_norm": 0.07531408965587616, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4635 + }, + { + "epoch": 3.702875399361022, + "grad_norm": 0.11631188541650772, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4636 + }, + { + "epoch": 3.7036741214057507, + "grad_norm": 0.11661874502897263, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4637 + }, + { + "epoch": 3.7044728434504792, + "grad_norm": 0.11709950119256973, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 4638 + }, + { + "epoch": 3.7052715654952078, + "grad_norm": 0.13420704007148743, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4639 + }, + { + "epoch": 3.7060702875399363, + "grad_norm": 0.08842958509922028, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4640 + }, + { + "epoch": 3.706869009584665, + "grad_norm": 0.07295326143503189, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4641 + }, + { + "epoch": 3.707667731629393, + "grad_norm": 0.14573390781879425, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4642 + }, + { + "epoch": 3.7084664536741214, + "grad_norm": 0.06639868766069412, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4643 + }, + { + "epoch": 3.70926517571885, + "grad_norm": 0.05936001241207123, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4644 + }, + { + "epoch": 3.710063897763578, + "grad_norm": 0.06534209847450256, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4645 + }, + { + "epoch": 3.7108626198083066, + "grad_norm": 0.13101834058761597, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4646 + }, + { + "epoch": 3.711661341853035, + "grad_norm": 0.07707498222589493, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4647 + }, + { + "epoch": 3.7124600638977636, + "grad_norm": 0.09272165596485138, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4648 + }, + { + "epoch": 3.713258785942492, + "grad_norm": 0.12538838386535645, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4649 + }, + { + "epoch": 3.7140575079872207, + "grad_norm": 0.10816318541765213, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4650 + }, + { + "epoch": 3.7148562300319488, + "grad_norm": 0.10610290616750717, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4651 + }, + { + "epoch": 3.7156549520766773, + "grad_norm": 0.09520592540502548, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4652 + }, + { + "epoch": 3.716453674121406, + "grad_norm": 0.05595150217413902, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4653 + }, + { + "epoch": 3.7172523961661343, + "grad_norm": 0.08114545047283173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4654 + }, + { + "epoch": 3.7180511182108624, + "grad_norm": 0.16090086102485657, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4655 + }, + { + "epoch": 3.718849840255591, + "grad_norm": 0.16332058608531952, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4656 + }, + { + "epoch": 3.7196485623003195, + "grad_norm": 0.17694437503814697, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4657 + }, + { + "epoch": 3.720447284345048, + "grad_norm": 0.16341771185398102, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4658 + }, + { + "epoch": 3.7212460063897765, + "grad_norm": 0.12268038839101791, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4659 + }, + { + "epoch": 3.722044728434505, + "grad_norm": 0.09971031546592712, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4660 + }, + { + "epoch": 3.722843450479233, + "grad_norm": 0.08546486496925354, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4661 + }, + { + "epoch": 3.7236421725239617, + "grad_norm": 0.15427617728710175, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4662 + }, + { + "epoch": 3.72444089456869, + "grad_norm": 0.1291000247001648, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4663 + }, + { + "epoch": 3.7252396166134183, + "grad_norm": 0.06823746860027313, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4664 + }, + { + "epoch": 3.726038338658147, + "grad_norm": 0.08133388310670853, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4665 + }, + { + "epoch": 3.7268370607028753, + "grad_norm": 0.08803416788578033, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4666 + }, + { + "epoch": 3.727635782747604, + "grad_norm": 0.05898858234286308, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4667 + }, + { + "epoch": 3.7284345047923324, + "grad_norm": 0.07650687545537949, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 4668 + }, + { + "epoch": 3.729233226837061, + "grad_norm": 0.15048138797283173, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4669 + }, + { + "epoch": 3.730031948881789, + "grad_norm": 0.08594254404306412, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4670 + }, + { + "epoch": 3.7308306709265175, + "grad_norm": 0.05322937294840813, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4671 + }, + { + "epoch": 3.731629392971246, + "grad_norm": 0.14541727304458618, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4672 + }, + { + "epoch": 3.7324281150159746, + "grad_norm": 0.10300826281309128, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4673 + }, + { + "epoch": 3.7332268370607027, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4674 + }, + { + "epoch": 3.734025559105431, + "grad_norm": 0.07101032137870789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4675 + }, + { + "epoch": 3.7348242811501597, + "grad_norm": 0.09166763722896576, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4676 + }, + { + "epoch": 3.7356230031948883, + "grad_norm": 0.06929054856300354, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4677 + }, + { + "epoch": 3.736421725239617, + "grad_norm": 0.05935844033956528, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4678 + }, + { + "epoch": 3.737220447284345, + "grad_norm": 0.09101571142673492, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4679 + }, + { + "epoch": 3.7380191693290734, + "grad_norm": 0.0979514792561531, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4680 + }, + { + "epoch": 3.738817891373802, + "grad_norm": 0.07105522602796555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4681 + }, + { + "epoch": 3.7396166134185305, + "grad_norm": 0.05741708725690842, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4682 + }, + { + "epoch": 3.7404153354632586, + "grad_norm": 0.051515400409698486, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4683 + }, + { + "epoch": 3.741214057507987, + "grad_norm": 0.06484496593475342, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4684 + }, + { + "epoch": 3.7420127795527156, + "grad_norm": 0.056751761585474014, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4685 + }, + { + "epoch": 3.742811501597444, + "grad_norm": 0.09628041833639145, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4686 + }, + { + "epoch": 3.7436102236421727, + "grad_norm": 0.13367851078510284, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 4687 + }, + { + "epoch": 3.744408945686901, + "grad_norm": 0.10439570248126984, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4688 + }, + { + "epoch": 3.7452076677316293, + "grad_norm": 0.05516012758016586, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4689 + }, + { + "epoch": 3.746006389776358, + "grad_norm": 0.0721910372376442, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 4690 + }, + { + "epoch": 3.7468051118210863, + "grad_norm": 0.10327166318893433, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4691 + }, + { + "epoch": 3.747603833865815, + "grad_norm": 0.10419414937496185, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4692 + }, + { + "epoch": 3.748402555910543, + "grad_norm": 0.07322157919406891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4693 + }, + { + "epoch": 3.7492012779552715, + "grad_norm": 0.05000368133187294, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4694 + }, + { + "epoch": 3.75, + "grad_norm": 0.055239707231521606, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4695 + }, + { + "epoch": 3.7507987220447285, + "grad_norm": 0.14060117304325104, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 4696 + }, + { + "epoch": 3.751597444089457, + "grad_norm": 0.1366022527217865, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4697 + }, + { + "epoch": 3.752396166134185, + "grad_norm": 0.15003731846809387, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4698 + }, + { + "epoch": 3.7531948881789137, + "grad_norm": 0.11602472513914108, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4699 + }, + { + "epoch": 3.753993610223642, + "grad_norm": 0.06956090778112411, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4700 + }, + { + "epoch": 3.7547923322683707, + "grad_norm": 0.04711974412202835, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4701 + }, + { + "epoch": 3.755591054313099, + "grad_norm": 0.09257466346025467, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4702 + }, + { + "epoch": 3.7563897763578273, + "grad_norm": 0.06598426401615143, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4703 + }, + { + "epoch": 3.757188498402556, + "grad_norm": 0.06239036098122597, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4704 + }, + { + "epoch": 3.7579872204472844, + "grad_norm": 0.10065969824790955, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4705 + }, + { + "epoch": 3.758785942492013, + "grad_norm": 0.12874993681907654, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4706 + }, + { + "epoch": 3.7595846645367414, + "grad_norm": 0.10291960090398788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4707 + }, + { + "epoch": 3.7603833865814695, + "grad_norm": 0.06138000637292862, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4708 + }, + { + "epoch": 3.761182108626198, + "grad_norm": 0.11565262079238892, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4709 + }, + { + "epoch": 3.7619808306709266, + "grad_norm": 0.08041521906852722, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4710 + }, + { + "epoch": 3.762779552715655, + "grad_norm": 0.07228218764066696, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4711 + }, + { + "epoch": 3.763578274760383, + "grad_norm": 0.09155906736850739, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4712 + }, + { + "epoch": 3.7643769968051117, + "grad_norm": 0.07468429207801819, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4713 + }, + { + "epoch": 3.7651757188498403, + "grad_norm": 0.07629574090242386, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4714 + }, + { + "epoch": 3.765974440894569, + "grad_norm": 0.1118689477443695, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4715 + }, + { + "epoch": 3.7667731629392973, + "grad_norm": 0.07983580976724625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4716 + }, + { + "epoch": 3.7675718849840254, + "grad_norm": 0.07225694507360458, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4717 + }, + { + "epoch": 3.768370607028754, + "grad_norm": 0.1322079598903656, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4718 + }, + { + "epoch": 3.7691693290734825, + "grad_norm": 0.17217211425304413, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4719 + }, + { + "epoch": 3.769968051118211, + "grad_norm": 0.14665336906909943, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4720 + }, + { + "epoch": 3.770766773162939, + "grad_norm": 0.09977035969495773, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4721 + }, + { + "epoch": 3.7715654952076676, + "grad_norm": 0.1346946358680725, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4722 + }, + { + "epoch": 3.772364217252396, + "grad_norm": 0.17330871522426605, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 4723 + }, + { + "epoch": 3.7731629392971247, + "grad_norm": 0.17789506912231445, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4724 + }, + { + "epoch": 3.773961661341853, + "grad_norm": 0.06285518407821655, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4725 + }, + { + "epoch": 3.7747603833865817, + "grad_norm": 0.13192926347255707, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4726 + }, + { + "epoch": 3.77555910543131, + "grad_norm": 0.12157132476568222, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4727 + }, + { + "epoch": 3.7763578274760383, + "grad_norm": 0.1203337088227272, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4728 + }, + { + "epoch": 3.777156549520767, + "grad_norm": 0.16711866855621338, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4729 + }, + { + "epoch": 3.777955271565495, + "grad_norm": 0.13596504926681519, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4730 + }, + { + "epoch": 3.7787539936102235, + "grad_norm": 0.13502761721611023, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4731 + }, + { + "epoch": 3.779552715654952, + "grad_norm": 0.0751141607761383, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4732 + }, + { + "epoch": 3.7803514376996805, + "grad_norm": 0.1104620099067688, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4733 + }, + { + "epoch": 3.781150159744409, + "grad_norm": 0.06397949904203415, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4734 + }, + { + "epoch": 3.7819488817891376, + "grad_norm": 0.07850230485200882, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 4735 + }, + { + "epoch": 3.7827476038338657, + "grad_norm": 0.10330549627542496, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4736 + }, + { + "epoch": 3.783546325878594, + "grad_norm": 0.08978938311338425, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4737 + }, + { + "epoch": 3.7843450479233227, + "grad_norm": 0.07073058933019638, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4738 + }, + { + "epoch": 3.7851437699680512, + "grad_norm": 0.05997786670923233, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4739 + }, + { + "epoch": 3.7859424920127793, + "grad_norm": 0.0779404565691948, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4740 + }, + { + "epoch": 3.786741214057508, + "grad_norm": 0.1367640644311905, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4741 + }, + { + "epoch": 3.7875399361022364, + "grad_norm": 0.08670534938573837, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4742 + }, + { + "epoch": 3.788338658146965, + "grad_norm": 0.08612547069787979, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4743 + }, + { + "epoch": 3.7891373801916934, + "grad_norm": 0.06312929093837738, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4744 + }, + { + "epoch": 3.789936102236422, + "grad_norm": 0.06397293508052826, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4745 + }, + { + "epoch": 3.79073482428115, + "grad_norm": 0.0663115605711937, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4746 + }, + { + "epoch": 3.7915335463258786, + "grad_norm": 0.07580576092004776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4747 + }, + { + "epoch": 3.792332268370607, + "grad_norm": 0.12604761123657227, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4748 + }, + { + "epoch": 3.793130990415335, + "grad_norm": 0.08900050073862076, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4749 + }, + { + "epoch": 3.7939297124600637, + "grad_norm": 0.09280730038881302, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4750 + }, + { + "epoch": 3.7947284345047922, + "grad_norm": 0.17689163982868195, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 4751 + }, + { + "epoch": 3.7955271565495208, + "grad_norm": 0.06348183006048203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4752 + }, + { + "epoch": 3.7963258785942493, + "grad_norm": 0.12626387178897858, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 4753 + }, + { + "epoch": 3.797124600638978, + "grad_norm": 0.1138390377163887, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4754 + }, + { + "epoch": 3.797923322683706, + "grad_norm": 0.08058728277683258, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4755 + }, + { + "epoch": 3.7987220447284344, + "grad_norm": 0.09671882539987564, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4756 + }, + { + "epoch": 3.799520766773163, + "grad_norm": 0.12193922698497772, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4757 + }, + { + "epoch": 3.8003194888178915, + "grad_norm": 0.31105268001556396, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4758 + }, + { + "epoch": 3.8011182108626196, + "grad_norm": 0.10482051223516464, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4759 + }, + { + "epoch": 3.801916932907348, + "grad_norm": 0.09116382896900177, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4760 + }, + { + "epoch": 3.8027156549520766, + "grad_norm": 0.08212421089410782, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4761 + }, + { + "epoch": 3.803514376996805, + "grad_norm": 0.08267461508512497, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4762 + }, + { + "epoch": 3.8043130990415337, + "grad_norm": 0.13247907161712646, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4763 + }, + { + "epoch": 3.8051118210862622, + "grad_norm": 0.1083490327000618, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 4764 + }, + { + "epoch": 3.8059105431309903, + "grad_norm": 0.11947019398212433, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4765 + }, + { + "epoch": 3.806709265175719, + "grad_norm": 0.08462221175432205, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4766 + }, + { + "epoch": 3.8075079872204474, + "grad_norm": 0.07244928181171417, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4767 + }, + { + "epoch": 3.8083067092651754, + "grad_norm": 0.13432611525058746, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4768 + }, + { + "epoch": 3.809105431309904, + "grad_norm": 0.16640888154506683, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4769 + }, + { + "epoch": 3.8099041533546325, + "grad_norm": 0.12189232558012009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4770 + }, + { + "epoch": 3.810702875399361, + "grad_norm": 0.052367180585861206, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4771 + }, + { + "epoch": 3.8115015974440896, + "grad_norm": 0.10426424443721771, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4772 + }, + { + "epoch": 3.812300319488818, + "grad_norm": 0.11365417391061783, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4773 + }, + { + "epoch": 3.813099041533546, + "grad_norm": 0.07064168155193329, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4774 + }, + { + "epoch": 3.8138977635782747, + "grad_norm": 0.2107549011707306, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4775 + }, + { + "epoch": 3.8146964856230032, + "grad_norm": 0.2984449565410614, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4776 + }, + { + "epoch": 3.8154952076677318, + "grad_norm": 0.26252058148384094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4777 + }, + { + "epoch": 3.81629392971246, + "grad_norm": 0.08128907531499863, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4778 + }, + { + "epoch": 3.8170926517571884, + "grad_norm": 0.2724008858203888, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4779 + }, + { + "epoch": 3.817891373801917, + "grad_norm": 0.2646482288837433, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4780 + }, + { + "epoch": 3.8186900958466454, + "grad_norm": 0.16063876450061798, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4781 + }, + { + "epoch": 3.819488817891374, + "grad_norm": 0.11671862006187439, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4782 + }, + { + "epoch": 3.8202875399361025, + "grad_norm": 0.21605245769023895, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4783 + }, + { + "epoch": 3.8210862619808306, + "grad_norm": 0.17344583570957184, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4784 + }, + { + "epoch": 3.821884984025559, + "grad_norm": 0.08113347738981247, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4785 + }, + { + "epoch": 3.8226837060702876, + "grad_norm": 0.11774581670761108, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4786 + }, + { + "epoch": 3.8234824281150157, + "grad_norm": 0.2024560272693634, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4787 + }, + { + "epoch": 3.8242811501597442, + "grad_norm": 0.5578162670135498, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 4788 + }, + { + "epoch": 3.8250798722044728, + "grad_norm": 0.10354574024677277, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4789 + }, + { + "epoch": 3.8258785942492013, + "grad_norm": 0.14583979547023773, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4790 + }, + { + "epoch": 3.82667731629393, + "grad_norm": 0.15853755176067352, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4791 + }, + { + "epoch": 3.8274760383386583, + "grad_norm": 0.1308104395866394, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4792 + }, + { + "epoch": 3.8282747603833864, + "grad_norm": 0.04385368898510933, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4793 + }, + { + "epoch": 3.829073482428115, + "grad_norm": 0.16213825345039368, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4794 + }, + { + "epoch": 3.8298722044728435, + "grad_norm": 0.2693546414375305, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4795 + }, + { + "epoch": 3.830670926517572, + "grad_norm": 0.23904170095920563, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4796 + }, + { + "epoch": 3.8314696485623, + "grad_norm": 0.11313450336456299, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4797 + }, + { + "epoch": 3.8322683706070286, + "grad_norm": 0.0770820751786232, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4798 + }, + { + "epoch": 3.833067092651757, + "grad_norm": 0.8537606596946716, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4799 + }, + { + "epoch": 3.8338658146964857, + "grad_norm": 0.13684043288230896, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4800 + }, + { + "epoch": 3.834664536741214, + "grad_norm": 0.0890694409608841, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 4801 + }, + { + "epoch": 3.8354632587859427, + "grad_norm": 0.060917336493730545, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4802 + }, + { + "epoch": 3.836261980830671, + "grad_norm": 0.13864673674106598, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4803 + }, + { + "epoch": 3.8370607028753994, + "grad_norm": 0.15316139161586761, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4804 + }, + { + "epoch": 3.837859424920128, + "grad_norm": 0.061508018523454666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4805 + }, + { + "epoch": 3.838658146964856, + "grad_norm": 0.126112699508667, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4806 + }, + { + "epoch": 3.8394568690095845, + "grad_norm": 0.1663133054971695, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4807 + }, + { + "epoch": 3.840255591054313, + "grad_norm": 0.14435894787311554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4808 + }, + { + "epoch": 3.8410543130990416, + "grad_norm": 0.06042332574725151, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4809 + }, + { + "epoch": 3.84185303514377, + "grad_norm": 0.12759631872177124, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4810 + }, + { + "epoch": 3.8426517571884986, + "grad_norm": 0.18153302371501923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4811 + }, + { + "epoch": 3.8434504792332267, + "grad_norm": 0.1280708760023117, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4812 + }, + { + "epoch": 3.844249201277955, + "grad_norm": 0.07144157588481903, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4813 + }, + { + "epoch": 3.8450479233226837, + "grad_norm": 0.13078796863555908, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4814 + }, + { + "epoch": 3.8458466453674123, + "grad_norm": 0.16230762004852295, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4815 + }, + { + "epoch": 3.8466453674121404, + "grad_norm": 0.10997766256332397, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4816 + }, + { + "epoch": 3.847444089456869, + "grad_norm": 0.06006971001625061, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4817 + }, + { + "epoch": 3.8482428115015974, + "grad_norm": 0.10155797749757767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4818 + }, + { + "epoch": 3.849041533546326, + "grad_norm": 0.11125919967889786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4819 + }, + { + "epoch": 3.8498402555910545, + "grad_norm": 0.0860416367650032, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 4820 + }, + { + "epoch": 3.850638977635783, + "grad_norm": 0.0862870067358017, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4821 + }, + { + "epoch": 3.851437699680511, + "grad_norm": 0.07229744642972946, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4822 + }, + { + "epoch": 3.8522364217252396, + "grad_norm": 0.10448424518108368, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4823 + }, + { + "epoch": 3.853035143769968, + "grad_norm": 0.08971705287694931, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 4824 + }, + { + "epoch": 3.8538338658146962, + "grad_norm": 0.09876695275306702, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4825 + }, + { + "epoch": 3.8546325878594248, + "grad_norm": 0.0667971819639206, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4826 + }, + { + "epoch": 3.8554313099041533, + "grad_norm": 0.14437620341777802, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4827 + }, + { + "epoch": 3.856230031948882, + "grad_norm": 0.17627735435962677, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4828 + }, + { + "epoch": 3.8570287539936103, + "grad_norm": 0.10524439066648483, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4829 + }, + { + "epoch": 3.857827476038339, + "grad_norm": 0.15091893076896667, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4830 + }, + { + "epoch": 3.858626198083067, + "grad_norm": 0.22534102201461792, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4831 + }, + { + "epoch": 3.8594249201277955, + "grad_norm": 0.08298768103122711, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4832 + }, + { + "epoch": 3.860223642172524, + "grad_norm": 0.16647395491600037, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4833 + }, + { + "epoch": 3.8610223642172525, + "grad_norm": 0.22512534260749817, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4834 + }, + { + "epoch": 3.8618210862619806, + "grad_norm": 0.2130710482597351, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4835 + }, + { + "epoch": 3.862619808306709, + "grad_norm": 0.1250864863395691, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4836 + }, + { + "epoch": 3.8634185303514377, + "grad_norm": 0.13937048614025116, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4837 + }, + { + "epoch": 3.864217252396166, + "grad_norm": 0.19059741497039795, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4838 + }, + { + "epoch": 3.8650159744408947, + "grad_norm": 0.22080829739570618, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4839 + }, + { + "epoch": 3.8658146964856233, + "grad_norm": 0.09463749825954437, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 4840 + }, + { + "epoch": 3.8666134185303513, + "grad_norm": 0.16431698203086853, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4841 + }, + { + "epoch": 3.86741214057508, + "grad_norm": 0.2162260264158249, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4842 + }, + { + "epoch": 3.8682108626198084, + "grad_norm": 0.0789603665471077, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4843 + }, + { + "epoch": 3.8690095846645365, + "grad_norm": 0.18372099101543427, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4844 + }, + { + "epoch": 3.869808306709265, + "grad_norm": 0.24845194816589355, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4845 + }, + { + "epoch": 3.8706070287539935, + "grad_norm": 0.22064632177352905, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4846 + }, + { + "epoch": 3.871405750798722, + "grad_norm": 0.0718264952301979, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4847 + }, + { + "epoch": 3.8722044728434506, + "grad_norm": 0.2048031985759735, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4848 + }, + { + "epoch": 3.873003194888179, + "grad_norm": 0.23190200328826904, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4849 + }, + { + "epoch": 3.873801916932907, + "grad_norm": 0.06851150840520859, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4850 + }, + { + "epoch": 3.8746006389776357, + "grad_norm": 0.2371164858341217, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4851 + }, + { + "epoch": 3.8753993610223643, + "grad_norm": 0.23518243432044983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4852 + }, + { + "epoch": 3.876198083067093, + "grad_norm": 0.08026961237192154, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4853 + }, + { + "epoch": 3.876996805111821, + "grad_norm": 0.1623634397983551, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4854 + }, + { + "epoch": 3.8777955271565494, + "grad_norm": 0.21676453948020935, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4855 + }, + { + "epoch": 3.878594249201278, + "grad_norm": 0.07868681848049164, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4856 + }, + { + "epoch": 3.8793929712460065, + "grad_norm": 0.18302997946739197, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4857 + }, + { + "epoch": 3.880191693290735, + "grad_norm": 0.2338407188653946, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4858 + }, + { + "epoch": 3.8809904153354635, + "grad_norm": 0.2534898817539215, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4859 + }, + { + "epoch": 3.8817891373801916, + "grad_norm": 0.19988521933555603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4860 + }, + { + "epoch": 3.88258785942492, + "grad_norm": 0.2896076440811157, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4861 + }, + { + "epoch": 3.8833865814696487, + "grad_norm": 0.1088651567697525, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4862 + }, + { + "epoch": 3.8841853035143767, + "grad_norm": 0.18549342453479767, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4863 + }, + { + "epoch": 3.8849840255591053, + "grad_norm": 0.24760019779205322, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4864 + }, + { + "epoch": 3.885782747603834, + "grad_norm": 0.1323750913143158, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4865 + }, + { + "epoch": 3.8865814696485623, + "grad_norm": 0.14235283434391022, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4866 + }, + { + "epoch": 3.887380191693291, + "grad_norm": 0.20409083366394043, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4867 + }, + { + "epoch": 3.8881789137380194, + "grad_norm": 0.1743297129869461, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4868 + }, + { + "epoch": 3.8889776357827475, + "grad_norm": 0.09692966938018799, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4869 + }, + { + "epoch": 3.889776357827476, + "grad_norm": 0.09934467077255249, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4870 + }, + { + "epoch": 3.8905750798722045, + "grad_norm": 0.2410827875137329, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4871 + }, + { + "epoch": 3.891373801916933, + "grad_norm": 0.27096229791641235, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4872 + }, + { + "epoch": 3.892172523961661, + "grad_norm": 0.09133906662464142, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4873 + }, + { + "epoch": 3.8929712460063897, + "grad_norm": 0.20275604724884033, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4874 + }, + { + "epoch": 3.893769968051118, + "grad_norm": 0.19578030705451965, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4875 + }, + { + "epoch": 3.8945686900958467, + "grad_norm": 0.12888970971107483, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4876 + }, + { + "epoch": 3.8953674121405752, + "grad_norm": 0.10301528871059418, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4877 + }, + { + "epoch": 3.8961661341853038, + "grad_norm": 0.1635914444923401, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4878 + }, + { + "epoch": 3.896964856230032, + "grad_norm": 0.1971803456544876, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4879 + }, + { + "epoch": 3.8977635782747604, + "grad_norm": 0.1085273027420044, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4880 + }, + { + "epoch": 3.898562300319489, + "grad_norm": 0.07375707477331161, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4881 + }, + { + "epoch": 3.899361022364217, + "grad_norm": 0.5828747153282166, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4882 + }, + { + "epoch": 3.9001597444089455, + "grad_norm": 0.10320120304822922, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4883 + }, + { + "epoch": 3.900958466453674, + "grad_norm": 0.10118676722049713, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4884 + }, + { + "epoch": 3.9017571884984026, + "grad_norm": 0.22034543752670288, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4885 + }, + { + "epoch": 3.902555910543131, + "grad_norm": 0.21823646128177643, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4886 + }, + { + "epoch": 3.9033546325878596, + "grad_norm": 0.14776065945625305, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4887 + }, + { + "epoch": 3.9041533546325877, + "grad_norm": 0.13297663629055023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4888 + }, + { + "epoch": 3.9049520766773163, + "grad_norm": 0.4447253942489624, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4889 + }, + { + "epoch": 3.905750798722045, + "grad_norm": 0.171112522482872, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4890 + }, + { + "epoch": 3.9065495207667733, + "grad_norm": 0.1581616848707199, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4891 + }, + { + "epoch": 3.9073482428115014, + "grad_norm": 0.18396562337875366, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4892 + }, + { + "epoch": 3.90814696485623, + "grad_norm": 0.15952393412590027, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4893 + }, + { + "epoch": 3.9089456869009584, + "grad_norm": 0.12889564037322998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4894 + }, + { + "epoch": 3.909744408945687, + "grad_norm": 0.130104660987854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4895 + }, + { + "epoch": 3.9105431309904155, + "grad_norm": 0.13011464476585388, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4896 + }, + { + "epoch": 3.911341853035144, + "grad_norm": 0.06485363095998764, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4897 + }, + { + "epoch": 3.912140575079872, + "grad_norm": 0.11353932321071625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4898 + }, + { + "epoch": 3.9129392971246006, + "grad_norm": 0.13279879093170166, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4899 + }, + { + "epoch": 3.913738019169329, + "grad_norm": 0.19181469082832336, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 4900 + }, + { + "epoch": 3.9145367412140573, + "grad_norm": 0.06930892914533615, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4901 + }, + { + "epoch": 3.915335463258786, + "grad_norm": 0.10591714829206467, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4902 + }, + { + "epoch": 3.9161341853035143, + "grad_norm": 0.09693296998739243, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4903 + }, + { + "epoch": 3.916932907348243, + "grad_norm": 0.1604270488023758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4904 + }, + { + "epoch": 3.9177316293929714, + "grad_norm": 0.19874586164951324, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4905 + }, + { + "epoch": 3.9185303514377, + "grad_norm": 0.09015987068414688, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4906 + }, + { + "epoch": 3.919329073482428, + "grad_norm": 0.09864864498376846, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4907 + }, + { + "epoch": 3.9201277955271565, + "grad_norm": 0.12509673833847046, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4908 + }, + { + "epoch": 3.920926517571885, + "grad_norm": 0.10216362774372101, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4909 + }, + { + "epoch": 3.9217252396166136, + "grad_norm": 0.11854741722345352, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4910 + }, + { + "epoch": 3.9225239616613417, + "grad_norm": 0.08570919930934906, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4911 + }, + { + "epoch": 3.92332268370607, + "grad_norm": 0.095781609416008, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4912 + }, + { + "epoch": 3.9241214057507987, + "grad_norm": 0.05698491260409355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4913 + }, + { + "epoch": 3.9249201277955272, + "grad_norm": 0.09786297380924225, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4914 + }, + { + "epoch": 3.9257188498402558, + "grad_norm": 0.1206512302160263, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4915 + }, + { + "epoch": 3.9265175718849843, + "grad_norm": 0.07593982666730881, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4916 + }, + { + "epoch": 3.9273162939297124, + "grad_norm": 0.06973730027675629, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4917 + }, + { + "epoch": 3.928115015974441, + "grad_norm": 0.07377546280622482, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4918 + }, + { + "epoch": 3.9289137380191694, + "grad_norm": 0.06871537119150162, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4919 + }, + { + "epoch": 3.9297124600638975, + "grad_norm": 0.09697525203227997, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4920 + }, + { + "epoch": 3.930511182108626, + "grad_norm": 0.07418478280305862, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4921 + }, + { + "epoch": 3.9313099041533546, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4922 + }, + { + "epoch": 3.932108626198083, + "grad_norm": 0.08099815994501114, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4923 + }, + { + "epoch": 3.9329073482428116, + "grad_norm": 0.08033913373947144, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4924 + }, + { + "epoch": 3.93370607028754, + "grad_norm": 0.1089775413274765, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4925 + }, + { + "epoch": 3.9345047923322682, + "grad_norm": 0.06866748631000519, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4926 + }, + { + "epoch": 3.9353035143769968, + "grad_norm": 0.12346489727497101, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4927 + }, + { + "epoch": 3.9361022364217253, + "grad_norm": 0.1388891190290451, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4928 + }, + { + "epoch": 3.936900958466454, + "grad_norm": 0.12678411602973938, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4929 + }, + { + "epoch": 3.937699680511182, + "grad_norm": 0.08638305962085724, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4930 + }, + { + "epoch": 3.9384984025559104, + "grad_norm": 0.667020320892334, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 4931 + }, + { + "epoch": 3.939297124600639, + "grad_norm": 0.0867542177438736, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4932 + }, + { + "epoch": 3.9400958466453675, + "grad_norm": 0.1075657457113266, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4933 + }, + { + "epoch": 3.940894568690096, + "grad_norm": 0.10359356552362442, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4934 + }, + { + "epoch": 3.9416932907348246, + "grad_norm": 0.04861772805452347, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4935 + }, + { + "epoch": 3.9424920127795526, + "grad_norm": 0.08871651440858841, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4936 + }, + { + "epoch": 3.943290734824281, + "grad_norm": 0.05268944799900055, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4937 + }, + { + "epoch": 3.9440894568690097, + "grad_norm": 0.11428069323301315, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4938 + }, + { + "epoch": 3.9448881789137378, + "grad_norm": 0.1302616149187088, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4939 + }, + { + "epoch": 3.9456869009584663, + "grad_norm": 0.09091098606586456, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4940 + }, + { + "epoch": 3.946485623003195, + "grad_norm": 0.23224923014640808, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4941 + }, + { + "epoch": 3.9472843450479234, + "grad_norm": 0.13427230715751648, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4942 + }, + { + "epoch": 3.948083067092652, + "grad_norm": 0.24157744646072388, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4943 + }, + { + "epoch": 3.9488817891373804, + "grad_norm": 0.15497569739818573, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4944 + }, + { + "epoch": 3.9496805111821085, + "grad_norm": 0.15587151050567627, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4945 + }, + { + "epoch": 3.950479233226837, + "grad_norm": 0.0827038437128067, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4946 + }, + { + "epoch": 3.9512779552715656, + "grad_norm": 0.17405007779598236, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4947 + }, + { + "epoch": 3.952076677316294, + "grad_norm": 0.1612532138824463, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4948 + }, + { + "epoch": 3.952875399361022, + "grad_norm": 0.07505665719509125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4949 + }, + { + "epoch": 3.9536741214057507, + "grad_norm": 0.07138567417860031, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4950 + }, + { + "epoch": 3.9544728434504792, + "grad_norm": 0.09206511080265045, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4951 + }, + { + "epoch": 3.9552715654952078, + "grad_norm": 0.09190725535154343, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4952 + }, + { + "epoch": 3.9560702875399363, + "grad_norm": 0.13024544715881348, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4953 + }, + { + "epoch": 3.956869009584665, + "grad_norm": 0.08161026239395142, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4954 + }, + { + "epoch": 3.957667731629393, + "grad_norm": 0.17207187414169312, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4955 + }, + { + "epoch": 3.9584664536741214, + "grad_norm": 0.096051886677742, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4956 + }, + { + "epoch": 3.95926517571885, + "grad_norm": 0.11038299649953842, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4957 + }, + { + "epoch": 3.960063897763578, + "grad_norm": 0.09957583248615265, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4958 + }, + { + "epoch": 3.9608626198083066, + "grad_norm": 0.06923667341470718, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4959 + }, + { + "epoch": 3.961661341853035, + "grad_norm": 0.07572069019079208, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4960 + }, + { + "epoch": 3.9624600638977636, + "grad_norm": 0.16801652312278748, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4961 + }, + { + "epoch": 3.963258785942492, + "grad_norm": 0.062117498368024826, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4962 + }, + { + "epoch": 3.9640575079872207, + "grad_norm": 0.08293396979570389, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4963 + }, + { + "epoch": 3.9648562300319488, + "grad_norm": 0.2021675407886505, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4964 + }, + { + "epoch": 3.9656549520766773, + "grad_norm": 0.10666973143815994, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4965 + }, + { + "epoch": 3.966453674121406, + "grad_norm": 0.09226572513580322, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4966 + }, + { + "epoch": 3.9672523961661343, + "grad_norm": 0.10113741457462311, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4967 + }, + { + "epoch": 3.9680511182108624, + "grad_norm": 0.10156626254320145, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4968 + }, + { + "epoch": 3.968849840255591, + "grad_norm": 0.08531442284584045, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4969 + }, + { + "epoch": 3.9696485623003195, + "grad_norm": 0.08894761651754379, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4970 + }, + { + "epoch": 3.970447284345048, + "grad_norm": 0.07934322953224182, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4971 + }, + { + "epoch": 3.9712460063897765, + "grad_norm": 0.07121701538562775, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4972 + }, + { + "epoch": 3.972044728434505, + "grad_norm": 0.09110251814126968, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4973 + }, + { + "epoch": 3.972843450479233, + "grad_norm": 0.09724952280521393, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4974 + }, + { + "epoch": 3.9736421725239617, + "grad_norm": 0.08619683235883713, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4975 + }, + { + "epoch": 3.97444089456869, + "grad_norm": 0.14789989590644836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4976 + }, + { + "epoch": 3.9752396166134183, + "grad_norm": 0.08736634254455566, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4977 + }, + { + "epoch": 3.976038338658147, + "grad_norm": 0.2260635793209076, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4978 + }, + { + "epoch": 3.9768370607028753, + "grad_norm": 0.2150910496711731, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4979 + }, + { + "epoch": 3.977635782747604, + "grad_norm": 0.12071242183446884, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4980 + }, + { + "epoch": 3.9784345047923324, + "grad_norm": 0.11614276468753815, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4981 + }, + { + "epoch": 3.979233226837061, + "grad_norm": 0.0954839214682579, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4982 + }, + { + "epoch": 3.980031948881789, + "grad_norm": 0.09801400452852249, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4983 + }, + { + "epoch": 3.9808306709265175, + "grad_norm": 0.07435343414545059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4984 + }, + { + "epoch": 3.981629392971246, + "grad_norm": 0.09401766955852509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4985 + }, + { + "epoch": 3.9824281150159746, + "grad_norm": 0.09850753843784332, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4986 + }, + { + "epoch": 3.9832268370607027, + "grad_norm": 0.07880235463380814, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4987 + }, + { + "epoch": 3.984025559105431, + "grad_norm": 0.08208848536014557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4988 + }, + { + "epoch": 3.9848242811501597, + "grad_norm": 0.10432668030261993, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4989 + }, + { + "epoch": 3.9856230031948883, + "grad_norm": 0.05202944204211235, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4990 + }, + { + "epoch": 3.986421725239617, + "grad_norm": 0.0831860601902008, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4991 + }, + { + "epoch": 3.987220447284345, + "grad_norm": 0.1084689050912857, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4992 + }, + { + "epoch": 3.9880191693290734, + "grad_norm": 0.1095893383026123, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4993 + }, + { + "epoch": 3.988817891373802, + "grad_norm": 0.24480414390563965, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4994 + }, + { + "epoch": 3.9896166134185305, + "grad_norm": 0.11939835548400879, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4995 + }, + { + "epoch": 3.9904153354632586, + "grad_norm": 0.0829034298658371, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4996 + }, + { + "epoch": 3.991214057507987, + "grad_norm": 0.1649356484413147, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4997 + }, + { + "epoch": 3.9920127795527156, + "grad_norm": 0.18428824841976166, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4998 + }, + { + "epoch": 3.992811501597444, + "grad_norm": 0.14441022276878357, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4999 + }, + { + "epoch": 3.9936102236421727, + "grad_norm": 0.1025838553905487, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5000 + }, + { + "epoch": 3.994408945686901, + "grad_norm": 0.18659353256225586, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5001 + }, + { + "epoch": 3.9952076677316293, + "grad_norm": 0.18462489545345306, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5002 + }, + { + "epoch": 3.996006389776358, + "grad_norm": 0.11221570521593094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5003 + }, + { + "epoch": 3.9968051118210863, + "grad_norm": 0.1611207127571106, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5004 + }, + { + "epoch": 3.997603833865815, + "grad_norm": 0.10003258287906647, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5005 + }, + { + "epoch": 3.998402555910543, + "grad_norm": 0.06686410307884216, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5006 + }, + { + "epoch": 3.9992012779552715, + "grad_norm": 0.07527180016040802, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5007 + }, + { + "epoch": 4.0, + "grad_norm": 0.11602520197629929, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5008 + }, + { + "epoch": 4.0007987220447285, + "grad_norm": 0.04460546746850014, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5009 + }, + { + "epoch": 4.001597444089457, + "grad_norm": 1.1286108493804932, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5010 + }, + { + "epoch": 4.002396166134186, + "grad_norm": 0.12730571627616882, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5011 + }, + { + "epoch": 4.003194888178914, + "grad_norm": 0.060798924416303635, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5012 + }, + { + "epoch": 4.003993610223642, + "grad_norm": 0.11491188406944275, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5013 + }, + { + "epoch": 4.00479233226837, + "grad_norm": 0.09877663850784302, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5014 + }, + { + "epoch": 4.005591054313099, + "grad_norm": 0.06991511583328247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5015 + }, + { + "epoch": 4.006389776357827, + "grad_norm": 0.05524459481239319, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5016 + }, + { + "epoch": 4.007188498402556, + "grad_norm": 0.07421471178531647, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5017 + }, + { + "epoch": 4.007987220447284, + "grad_norm": 0.10918284207582474, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5018 + }, + { + "epoch": 4.008785942492013, + "grad_norm": 0.42926761507987976, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5019 + }, + { + "epoch": 4.0095846645367414, + "grad_norm": 0.12511351704597473, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5020 + }, + { + "epoch": 4.01038338658147, + "grad_norm": 0.0985826924443245, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5021 + }, + { + "epoch": 4.0111821086261985, + "grad_norm": 0.10876046866178513, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5022 + }, + { + "epoch": 4.011980830670926, + "grad_norm": 0.0973401740193367, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5023 + }, + { + "epoch": 4.012779552715655, + "grad_norm": 0.10867046564817429, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5024 + }, + { + "epoch": 4.013578274760383, + "grad_norm": 0.16030259430408478, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5025 + }, + { + "epoch": 4.014376996805112, + "grad_norm": 0.09972470998764038, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5026 + }, + { + "epoch": 4.01517571884984, + "grad_norm": 0.06945701688528061, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5027 + }, + { + "epoch": 4.015974440894569, + "grad_norm": 0.12256570160388947, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5028 + }, + { + "epoch": 4.016773162939297, + "grad_norm": 0.1318589597940445, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5029 + }, + { + "epoch": 4.017571884984026, + "grad_norm": 0.14831772446632385, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5030 + }, + { + "epoch": 4.018370607028754, + "grad_norm": 0.12650129199028015, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5031 + }, + { + "epoch": 4.019169329073482, + "grad_norm": 0.25457820296287537, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5032 + }, + { + "epoch": 4.0199680511182105, + "grad_norm": 0.10183271020650864, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5033 + }, + { + "epoch": 4.020766773162939, + "grad_norm": 0.14198726415634155, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5034 + }, + { + "epoch": 4.021565495207668, + "grad_norm": 0.1551627218723297, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5035 + }, + { + "epoch": 4.022364217252396, + "grad_norm": 0.29212328791618347, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5036 + }, + { + "epoch": 4.023162939297125, + "grad_norm": 0.25203290581703186, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5037 + }, + { + "epoch": 4.023961661341853, + "grad_norm": 0.12793950736522675, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5038 + }, + { + "epoch": 4.024760383386582, + "grad_norm": 0.10916420817375183, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5039 + }, + { + "epoch": 4.02555910543131, + "grad_norm": 0.09980735182762146, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5040 + }, + { + "epoch": 4.026357827476039, + "grad_norm": 0.1633901745080948, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5041 + }, + { + "epoch": 4.027156549520766, + "grad_norm": 0.10058299452066422, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5042 + }, + { + "epoch": 4.027955271565495, + "grad_norm": 0.08121561259031296, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5043 + }, + { + "epoch": 4.0287539936102235, + "grad_norm": 0.19947005808353424, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5044 + }, + { + "epoch": 4.029552715654952, + "grad_norm": 0.24219068884849548, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5045 + }, + { + "epoch": 4.0303514376996805, + "grad_norm": 0.28928735852241516, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5046 + }, + { + "epoch": 4.031150159744409, + "grad_norm": 0.062404267489910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5047 + }, + { + "epoch": 4.031948881789138, + "grad_norm": 0.1607569456100464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5048 + }, + { + "epoch": 4.032747603833866, + "grad_norm": 0.14420244097709656, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5049 + }, + { + "epoch": 4.033546325878595, + "grad_norm": 0.838013768196106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5050 + }, + { + "epoch": 4.034345047923322, + "grad_norm": 0.15198078751564026, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5051 + }, + { + "epoch": 4.035143769968051, + "grad_norm": 0.18439999222755432, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5052 + }, + { + "epoch": 4.035942492012779, + "grad_norm": 0.1283460259437561, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5053 + }, + { + "epoch": 4.036741214057508, + "grad_norm": 0.07285412400960922, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5054 + }, + { + "epoch": 4.037539936102236, + "grad_norm": 0.21856451034545898, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5055 + }, + { + "epoch": 4.038338658146965, + "grad_norm": 0.1934041529893875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5056 + }, + { + "epoch": 4.039137380191693, + "grad_norm": 0.07998216152191162, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5057 + }, + { + "epoch": 4.039936102236422, + "grad_norm": 0.2202988713979721, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5058 + }, + { + "epoch": 4.0407348242811505, + "grad_norm": 0.22000271081924438, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5059 + }, + { + "epoch": 4.041533546325879, + "grad_norm": 0.06229308247566223, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5060 + }, + { + "epoch": 4.042332268370607, + "grad_norm": 0.19611188769340515, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5061 + }, + { + "epoch": 4.043130990415335, + "grad_norm": 0.2385999858379364, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5062 + }, + { + "epoch": 4.043929712460064, + "grad_norm": 0.06504995375871658, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5063 + }, + { + "epoch": 4.044728434504792, + "grad_norm": 0.17860567569732666, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5064 + }, + { + "epoch": 4.045527156549521, + "grad_norm": 0.17580853402614594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5065 + }, + { + "epoch": 4.046325878594249, + "grad_norm": 0.06523217260837555, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5066 + }, + { + "epoch": 4.047124600638978, + "grad_norm": 0.2795565128326416, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5067 + }, + { + "epoch": 4.047923322683706, + "grad_norm": 0.289105623960495, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5068 + }, + { + "epoch": 4.048722044728435, + "grad_norm": 0.07829197496175766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5069 + }, + { + "epoch": 4.0495207667731625, + "grad_norm": 0.24165435135364532, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5070 + }, + { + "epoch": 4.050319488817891, + "grad_norm": 0.2785094976425171, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5071 + }, + { + "epoch": 4.05111821086262, + "grad_norm": 0.08929550647735596, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5072 + }, + { + "epoch": 4.051916932907348, + "grad_norm": 0.24677781760692596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5073 + }, + { + "epoch": 4.052715654952077, + "grad_norm": 0.25207674503326416, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5074 + }, + { + "epoch": 4.053514376996805, + "grad_norm": 0.06409729272127151, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5075 + }, + { + "epoch": 4.054313099041534, + "grad_norm": 0.2670205235481262, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5076 + }, + { + "epoch": 4.055111821086262, + "grad_norm": 0.1854943484067917, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5077 + }, + { + "epoch": 4.055910543130991, + "grad_norm": 0.1409354954957962, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5078 + }, + { + "epoch": 4.056709265175719, + "grad_norm": 0.24084609746932983, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5079 + }, + { + "epoch": 4.057507987220447, + "grad_norm": 0.16520382463932037, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5080 + }, + { + "epoch": 4.0583067092651754, + "grad_norm": 0.11086967587471008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5081 + }, + { + "epoch": 4.059105431309904, + "grad_norm": 0.15748612582683563, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5082 + }, + { + "epoch": 4.0599041533546325, + "grad_norm": 0.1196034848690033, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5083 + }, + { + "epoch": 4.060702875399361, + "grad_norm": 0.06799823045730591, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5084 + }, + { + "epoch": 4.06150159744409, + "grad_norm": 0.1223025768995285, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5085 + }, + { + "epoch": 4.062300319488818, + "grad_norm": 0.04760991781949997, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5086 + }, + { + "epoch": 4.063099041533547, + "grad_norm": 0.11782078444957733, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5087 + }, + { + "epoch": 4.063897763578275, + "grad_norm": 0.13057227432727814, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5088 + }, + { + "epoch": 4.064696485623003, + "grad_norm": 0.0719611644744873, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5089 + }, + { + "epoch": 4.065495207667731, + "grad_norm": 0.13513247668743134, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5090 + }, + { + "epoch": 4.06629392971246, + "grad_norm": 0.14960692822933197, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5091 + }, + { + "epoch": 4.067092651757188, + "grad_norm": 0.06219497323036194, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5092 + }, + { + "epoch": 4.067891373801917, + "grad_norm": 0.06755383312702179, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5093 + }, + { + "epoch": 4.068690095846645, + "grad_norm": 0.08237830549478531, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5094 + }, + { + "epoch": 4.069488817891374, + "grad_norm": 0.0915946289896965, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5095 + }, + { + "epoch": 4.0702875399361025, + "grad_norm": 0.06893479824066162, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5096 + }, + { + "epoch": 4.071086261980831, + "grad_norm": 0.04133071005344391, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5097 + }, + { + "epoch": 4.0718849840255595, + "grad_norm": 0.062333185225725174, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5098 + }, + { + "epoch": 4.072683706070287, + "grad_norm": 0.05741016939282417, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5099 + }, + { + "epoch": 4.073482428115016, + "grad_norm": 0.04988866671919823, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5100 + }, + { + "epoch": 4.074281150159744, + "grad_norm": 0.050187818706035614, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5101 + }, + { + "epoch": 4.075079872204473, + "grad_norm": 0.08479643613100052, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5102 + }, + { + "epoch": 4.075878594249201, + "grad_norm": 0.13840351998806, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5103 + }, + { + "epoch": 4.07667731629393, + "grad_norm": 0.11400903016328812, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5104 + }, + { + "epoch": 4.077476038338658, + "grad_norm": 0.06956811994314194, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5105 + }, + { + "epoch": 4.078274760383387, + "grad_norm": 0.09173833578824997, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5106 + }, + { + "epoch": 4.079073482428115, + "grad_norm": 0.09024006128311157, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5107 + }, + { + "epoch": 4.079872204472843, + "grad_norm": 0.04257406294345856, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5108 + }, + { + "epoch": 4.080670926517572, + "grad_norm": 0.04252707585692406, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5109 + }, + { + "epoch": 4.0814696485623, + "grad_norm": 0.052367035299539566, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5110 + }, + { + "epoch": 4.082268370607029, + "grad_norm": 0.06344939023256302, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5111 + }, + { + "epoch": 4.083067092651757, + "grad_norm": 0.04674215242266655, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5112 + }, + { + "epoch": 4.083865814696486, + "grad_norm": 0.03664534166455269, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5113 + }, + { + "epoch": 4.084664536741214, + "grad_norm": 0.07198764383792877, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5114 + }, + { + "epoch": 4.085463258785943, + "grad_norm": 0.06294529885053635, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5115 + }, + { + "epoch": 4.086261980830671, + "grad_norm": 0.09595668315887451, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5116 + }, + { + "epoch": 4.0870607028754, + "grad_norm": 0.09830893576145172, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5117 + }, + { + "epoch": 4.087859424920127, + "grad_norm": 0.09647611528635025, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5118 + }, + { + "epoch": 4.088658146964856, + "grad_norm": 0.04558149725198746, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5119 + }, + { + "epoch": 4.0894568690095845, + "grad_norm": 0.11090628057718277, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5120 + }, + { + "epoch": 4.090255591054313, + "grad_norm": 0.1119648665189743, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5121 + }, + { + "epoch": 4.0910543130990416, + "grad_norm": 0.0372939296066761, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5122 + }, + { + "epoch": 4.09185303514377, + "grad_norm": 0.10749047994613647, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5123 + }, + { + "epoch": 4.092651757188499, + "grad_norm": 0.08718341588973999, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5124 + }, + { + "epoch": 4.093450479233227, + "grad_norm": 0.04954478517174721, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5125 + }, + { + "epoch": 4.094249201277956, + "grad_norm": 0.0599503293633461, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5126 + }, + { + "epoch": 4.095047923322683, + "grad_norm": 0.04633599892258644, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5127 + }, + { + "epoch": 4.095846645367412, + "grad_norm": 0.0502074733376503, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5128 + }, + { + "epoch": 4.09664536741214, + "grad_norm": 0.1348472684621811, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5129 + }, + { + "epoch": 4.097444089456869, + "grad_norm": 0.07534858584403992, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5130 + }, + { + "epoch": 4.098242811501597, + "grad_norm": 0.04207107052206993, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5131 + }, + { + "epoch": 4.099041533546326, + "grad_norm": 0.062090687453746796, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5132 + }, + { + "epoch": 4.0998402555910545, + "grad_norm": 0.08783479779958725, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5133 + }, + { + "epoch": 4.100638977635783, + "grad_norm": 0.04489055275917053, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5134 + }, + { + "epoch": 4.1014376996805115, + "grad_norm": 0.07360105961561203, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5135 + }, + { + "epoch": 4.102236421725239, + "grad_norm": 0.10253020375967026, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5136 + }, + { + "epoch": 4.103035143769968, + "grad_norm": 0.12787389755249023, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5137 + }, + { + "epoch": 4.103833865814696, + "grad_norm": 0.43946513533592224, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5138 + }, + { + "epoch": 4.104632587859425, + "grad_norm": 0.7717093825340271, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5139 + }, + { + "epoch": 4.105431309904153, + "grad_norm": 0.1433849334716797, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5140 + }, + { + "epoch": 4.106230031948882, + "grad_norm": 0.09110052138566971, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5141 + }, + { + "epoch": 4.10702875399361, + "grad_norm": 0.13785111904144287, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5142 + }, + { + "epoch": 4.107827476038339, + "grad_norm": 0.0910695344209671, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5143 + }, + { + "epoch": 4.108626198083067, + "grad_norm": 0.10390721261501312, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5144 + }, + { + "epoch": 4.109424920127796, + "grad_norm": 0.07039178162813187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5145 + }, + { + "epoch": 4.110223642172524, + "grad_norm": 0.08536665886640549, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5146 + }, + { + "epoch": 4.111022364217252, + "grad_norm": 0.1355360597372055, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5147 + }, + { + "epoch": 4.111821086261981, + "grad_norm": 0.13981834053993225, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5148 + }, + { + "epoch": 4.112619808306709, + "grad_norm": 0.12653453648090363, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5149 + }, + { + "epoch": 4.113418530351438, + "grad_norm": 0.06805716454982758, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5150 + }, + { + "epoch": 4.114217252396166, + "grad_norm": 0.14361023902893066, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5151 + }, + { + "epoch": 4.115015974440895, + "grad_norm": 0.15223950147628784, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5152 + }, + { + "epoch": 4.115814696485623, + "grad_norm": 0.10013193637132645, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5153 + }, + { + "epoch": 4.116613418530352, + "grad_norm": 0.21049730479717255, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5154 + }, + { + "epoch": 4.11741214057508, + "grad_norm": 0.1393776834011078, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5155 + }, + { + "epoch": 4.118210862619808, + "grad_norm": 0.08584857732057571, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5156 + }, + { + "epoch": 4.1190095846645365, + "grad_norm": 0.06729432195425034, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5157 + }, + { + "epoch": 4.119808306709265, + "grad_norm": 0.08861853927373886, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5158 + }, + { + "epoch": 4.1206070287539935, + "grad_norm": 0.07037574052810669, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5159 + }, + { + "epoch": 4.121405750798722, + "grad_norm": 0.08049193024635315, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5160 + }, + { + "epoch": 4.122204472843451, + "grad_norm": 0.09040962159633636, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5161 + }, + { + "epoch": 4.123003194888179, + "grad_norm": 0.06531825661659241, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5162 + }, + { + "epoch": 4.123801916932908, + "grad_norm": 0.09423618763685226, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5163 + }, + { + "epoch": 4.124600638977636, + "grad_norm": 0.09436366707086563, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5164 + }, + { + "epoch": 4.125399361022364, + "grad_norm": 0.07543698698282242, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5165 + }, + { + "epoch": 4.126198083067092, + "grad_norm": 0.07491134852170944, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 5166 + }, + { + "epoch": 4.126996805111821, + "grad_norm": 0.09040437638759613, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5167 + }, + { + "epoch": 4.127795527156549, + "grad_norm": 0.11145798116922379, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5168 + }, + { + "epoch": 4.128594249201278, + "grad_norm": 0.35186707973480225, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5169 + }, + { + "epoch": 4.1293929712460065, + "grad_norm": 0.08744635432958603, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5170 + }, + { + "epoch": 4.130191693290735, + "grad_norm": 0.1078719049692154, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5171 + }, + { + "epoch": 4.1309904153354635, + "grad_norm": 0.13568760454654694, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5172 + }, + { + "epoch": 4.131789137380192, + "grad_norm": 0.10629335045814514, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5173 + }, + { + "epoch": 4.13258785942492, + "grad_norm": 0.3467697203159332, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5174 + }, + { + "epoch": 4.133386581469648, + "grad_norm": 0.5514479875564575, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5175 + }, + { + "epoch": 4.134185303514377, + "grad_norm": 0.2762874960899353, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5176 + }, + { + "epoch": 4.134984025559105, + "grad_norm": 0.25959524512290955, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5177 + }, + { + "epoch": 4.135782747603834, + "grad_norm": 0.26429036259651184, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5178 + }, + { + "epoch": 4.136581469648562, + "grad_norm": 0.4492235779762268, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5179 + }, + { + "epoch": 4.137380191693291, + "grad_norm": 0.3261977732181549, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 5180 + }, + { + "epoch": 4.138178913738019, + "grad_norm": 0.15618108212947845, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5181 + }, + { + "epoch": 4.138977635782748, + "grad_norm": 0.2897289991378784, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5182 + }, + { + "epoch": 4.139776357827476, + "grad_norm": 0.2599884271621704, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5183 + }, + { + "epoch": 4.140575079872204, + "grad_norm": 0.3158198893070221, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5184 + }, + { + "epoch": 4.141373801916933, + "grad_norm": 0.2701073884963989, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5185 + }, + { + "epoch": 4.142172523961661, + "grad_norm": 0.14668017625808716, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5186 + }, + { + "epoch": 4.14297124600639, + "grad_norm": 0.14284202456474304, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5187 + }, + { + "epoch": 4.143769968051118, + "grad_norm": 0.1901128888130188, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5188 + }, + { + "epoch": 4.144568690095847, + "grad_norm": 0.17808575928211212, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5189 + }, + { + "epoch": 4.145367412140575, + "grad_norm": 0.11329478025436401, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5190 + }, + { + "epoch": 4.146166134185304, + "grad_norm": 0.10816467553377151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5191 + }, + { + "epoch": 4.146964856230032, + "grad_norm": 0.11593834310770035, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5192 + }, + { + "epoch": 4.147763578274761, + "grad_norm": 0.17315705120563507, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5193 + }, + { + "epoch": 4.1485623003194885, + "grad_norm": 0.10884186625480652, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5194 + }, + { + "epoch": 4.149361022364217, + "grad_norm": 0.17528203129768372, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5195 + }, + { + "epoch": 4.1501597444089455, + "grad_norm": 0.3249641954898834, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5196 + }, + { + "epoch": 4.150958466453674, + "grad_norm": 0.2920859456062317, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5197 + }, + { + "epoch": 4.151757188498403, + "grad_norm": 0.12487918138504028, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5198 + }, + { + "epoch": 4.152555910543131, + "grad_norm": 0.07744348049163818, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5199 + }, + { + "epoch": 4.15335463258786, + "grad_norm": 0.11721999943256378, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5200 + }, + { + "epoch": 4.154153354632588, + "grad_norm": 0.17566390335559845, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5201 + }, + { + "epoch": 4.154952076677317, + "grad_norm": 0.09762726724147797, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5202 + }, + { + "epoch": 4.155750798722044, + "grad_norm": 0.10769844055175781, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5203 + }, + { + "epoch": 4.156549520766773, + "grad_norm": 0.1608363389968872, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5204 + }, + { + "epoch": 4.157348242811501, + "grad_norm": 0.1575978696346283, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5205 + }, + { + "epoch": 4.15814696485623, + "grad_norm": 0.2035059779882431, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5206 + }, + { + "epoch": 4.1589456869009584, + "grad_norm": 0.1405210644006729, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5207 + }, + { + "epoch": 4.159744408945687, + "grad_norm": 0.18898408114910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5208 + }, + { + "epoch": 4.1605431309904155, + "grad_norm": 0.20012563467025757, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5209 + }, + { + "epoch": 4.161341853035144, + "grad_norm": 0.14585568010807037, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5210 + }, + { + "epoch": 4.162140575079873, + "grad_norm": 0.166448175907135, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5211 + }, + { + "epoch": 4.1629392971246, + "grad_norm": 0.08768735080957413, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5212 + }, + { + "epoch": 4.163738019169329, + "grad_norm": 0.12429258227348328, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5213 + }, + { + "epoch": 4.164536741214057, + "grad_norm": 0.06750953942537308, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5214 + }, + { + "epoch": 4.165335463258786, + "grad_norm": 0.10137717425823212, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5215 + }, + { + "epoch": 4.166134185303514, + "grad_norm": 0.1015368178486824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5216 + }, + { + "epoch": 4.166932907348243, + "grad_norm": 0.12396319955587387, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5217 + }, + { + "epoch": 4.167731629392971, + "grad_norm": 0.11295704543590546, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5218 + }, + { + "epoch": 4.1685303514377, + "grad_norm": 0.1415906846523285, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5219 + }, + { + "epoch": 4.169329073482428, + "grad_norm": 0.1300252079963684, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5220 + }, + { + "epoch": 4.170127795527157, + "grad_norm": 0.09486760199069977, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5221 + }, + { + "epoch": 4.170926517571885, + "grad_norm": 0.25776198506355286, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5222 + }, + { + "epoch": 4.171725239616613, + "grad_norm": 0.07684944570064545, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5223 + }, + { + "epoch": 4.172523961661342, + "grad_norm": 0.06909538060426712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5224 + }, + { + "epoch": 4.17332268370607, + "grad_norm": 0.09686419367790222, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5225 + }, + { + "epoch": 4.174121405750799, + "grad_norm": 0.10760180652141571, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5226 + }, + { + "epoch": 4.174920127795527, + "grad_norm": 0.0963902473449707, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5227 + }, + { + "epoch": 4.175718849840256, + "grad_norm": 0.12986192107200623, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5228 + }, + { + "epoch": 4.176517571884984, + "grad_norm": 0.12532354891300201, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5229 + }, + { + "epoch": 4.177316293929713, + "grad_norm": 0.158639058470726, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5230 + }, + { + "epoch": 4.178115015974441, + "grad_norm": 0.10025905817747116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5231 + }, + { + "epoch": 4.178913738019169, + "grad_norm": 0.19150952994823456, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5232 + }, + { + "epoch": 4.1797124600638975, + "grad_norm": 0.10650201886892319, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5233 + }, + { + "epoch": 4.180511182108626, + "grad_norm": 0.08948210626840591, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5234 + }, + { + "epoch": 4.181309904153355, + "grad_norm": 0.144260972738266, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5235 + }, + { + "epoch": 4.182108626198083, + "grad_norm": 0.10631201416254044, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5236 + }, + { + "epoch": 4.182907348242812, + "grad_norm": 0.17884188890457153, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5237 + }, + { + "epoch": 4.18370607028754, + "grad_norm": 0.12393054366111755, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5238 + }, + { + "epoch": 4.184504792332269, + "grad_norm": 0.10113117098808289, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5239 + }, + { + "epoch": 4.185303514376997, + "grad_norm": 0.08745535463094711, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5240 + }, + { + "epoch": 4.186102236421725, + "grad_norm": 0.12319829314947128, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5241 + }, + { + "epoch": 4.186900958466453, + "grad_norm": 0.10202868282794952, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5242 + }, + { + "epoch": 4.187699680511182, + "grad_norm": 0.12799306213855743, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5243 + }, + { + "epoch": 4.18849840255591, + "grad_norm": 0.10247227549552917, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5244 + }, + { + "epoch": 4.189297124600639, + "grad_norm": 0.0876200944185257, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5245 + }, + { + "epoch": 4.1900958466453675, + "grad_norm": 0.08829693496227264, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5246 + }, + { + "epoch": 4.190894568690096, + "grad_norm": 0.09005091339349747, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5247 + }, + { + "epoch": 4.1916932907348246, + "grad_norm": 0.06715424358844757, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5248 + }, + { + "epoch": 4.192492012779553, + "grad_norm": 0.11082255840301514, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5249 + }, + { + "epoch": 4.193290734824281, + "grad_norm": 0.08197743445634842, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5250 + }, + { + "epoch": 4.194089456869009, + "grad_norm": 0.08641887456178665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5251 + }, + { + "epoch": 4.194888178913738, + "grad_norm": 0.29264676570892334, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5252 + }, + { + "epoch": 4.195686900958466, + "grad_norm": 0.10122201591730118, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5253 + }, + { + "epoch": 4.196485623003195, + "grad_norm": 0.13220930099487305, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5254 + }, + { + "epoch": 4.197284345047923, + "grad_norm": 0.05919777229428291, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5255 + }, + { + "epoch": 4.198083067092652, + "grad_norm": 0.15947407484054565, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5256 + }, + { + "epoch": 4.19888178913738, + "grad_norm": 0.08046088367700577, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5257 + }, + { + "epoch": 4.199680511182109, + "grad_norm": 0.08504491299390793, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5258 + }, + { + "epoch": 4.2004792332268375, + "grad_norm": 0.2523876428604126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5259 + }, + { + "epoch": 4.201277955271565, + "grad_norm": 0.32436496019363403, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5260 + }, + { + "epoch": 4.202076677316294, + "grad_norm": 0.3832956552505493, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5261 + }, + { + "epoch": 4.202875399361022, + "grad_norm": 0.15481804311275482, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5262 + }, + { + "epoch": 4.203674121405751, + "grad_norm": 0.5061212182044983, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5263 + }, + { + "epoch": 4.204472843450479, + "grad_norm": 0.2778873145580292, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5264 + }, + { + "epoch": 4.205271565495208, + "grad_norm": 0.10782434046268463, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5265 + }, + { + "epoch": 4.206070287539936, + "grad_norm": 0.2730430066585541, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5266 + }, + { + "epoch": 4.206869009584665, + "grad_norm": 0.14902958273887634, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5267 + }, + { + "epoch": 4.207667731629393, + "grad_norm": 0.2455812245607376, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5268 + }, + { + "epoch": 4.208466453674121, + "grad_norm": 0.36285653710365295, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5269 + }, + { + "epoch": 4.2092651757188495, + "grad_norm": 0.16104358434677124, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5270 + }, + { + "epoch": 4.210063897763578, + "grad_norm": 0.10330995172262192, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5271 + }, + { + "epoch": 4.210862619808307, + "grad_norm": 0.14438849687576294, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5272 + }, + { + "epoch": 4.211661341853035, + "grad_norm": 0.11719724535942078, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5273 + }, + { + "epoch": 4.212460063897764, + "grad_norm": 0.13503463566303253, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5274 + }, + { + "epoch": 4.213258785942492, + "grad_norm": 0.12717710435390472, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5275 + }, + { + "epoch": 4.214057507987221, + "grad_norm": 0.12293769419193268, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5276 + }, + { + "epoch": 4.214856230031949, + "grad_norm": 0.11828786134719849, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5277 + }, + { + "epoch": 4.215654952076678, + "grad_norm": 0.11118468642234802, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5278 + }, + { + "epoch": 4.216453674121405, + "grad_norm": 0.15688025951385498, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5279 + }, + { + "epoch": 4.217252396166134, + "grad_norm": 0.10603991895914078, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5280 + }, + { + "epoch": 4.218051118210862, + "grad_norm": 0.14034971594810486, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5281 + }, + { + "epoch": 4.218849840255591, + "grad_norm": 0.21270571649074554, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5282 + }, + { + "epoch": 4.2196485623003195, + "grad_norm": 0.17699144780635834, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5283 + }, + { + "epoch": 4.220447284345048, + "grad_norm": 0.07665220648050308, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5284 + }, + { + "epoch": 4.2212460063897765, + "grad_norm": 0.13917282223701477, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5285 + }, + { + "epoch": 4.222044728434505, + "grad_norm": 0.1253320872783661, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5286 + }, + { + "epoch": 4.222843450479234, + "grad_norm": 0.07693646103143692, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5287 + }, + { + "epoch": 4.223642172523961, + "grad_norm": 0.11877891421318054, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5288 + }, + { + "epoch": 4.22444089456869, + "grad_norm": 0.08900399506092072, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5289 + }, + { + "epoch": 4.225239616613418, + "grad_norm": 0.08575741946697235, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5290 + }, + { + "epoch": 4.226038338658147, + "grad_norm": 0.11078973859548569, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5291 + }, + { + "epoch": 4.226837060702875, + "grad_norm": 0.12371394783258438, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5292 + }, + { + "epoch": 4.227635782747604, + "grad_norm": 0.11741651594638824, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5293 + }, + { + "epoch": 4.228434504792332, + "grad_norm": 0.1316244751214981, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5294 + }, + { + "epoch": 4.229233226837061, + "grad_norm": 0.07751733064651489, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5295 + }, + { + "epoch": 4.2300319488817895, + "grad_norm": 0.13512739539146423, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5296 + }, + { + "epoch": 4.230830670926518, + "grad_norm": 0.14408327639102936, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5297 + }, + { + "epoch": 4.231629392971246, + "grad_norm": 0.05596759170293808, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5298 + }, + { + "epoch": 4.232428115015974, + "grad_norm": 0.20518198609352112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5299 + }, + { + "epoch": 4.233226837060703, + "grad_norm": 0.17000356316566467, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5300 + }, + { + "epoch": 4.234025559105431, + "grad_norm": 0.10213350504636765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5301 + }, + { + "epoch": 4.23482428115016, + "grad_norm": 0.1633368879556656, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 4.235623003194888, + "grad_norm": 0.17330236732959747, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5303 + }, + { + "epoch": 4.236421725239617, + "grad_norm": 0.20028679072856903, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5304 + }, + { + "epoch": 4.237220447284345, + "grad_norm": 0.23386533558368683, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5305 + }, + { + "epoch": 4.238019169329074, + "grad_norm": 0.051739469170570374, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5306 + }, + { + "epoch": 4.2388178913738015, + "grad_norm": 0.19732257723808289, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5307 + }, + { + "epoch": 4.23961661341853, + "grad_norm": 0.1318890005350113, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5308 + }, + { + "epoch": 4.2404153354632586, + "grad_norm": 0.17188113927841187, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5309 + }, + { + "epoch": 4.241214057507987, + "grad_norm": 0.23981456458568573, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5310 + }, + { + "epoch": 4.242012779552716, + "grad_norm": 0.15658913552761078, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5311 + }, + { + "epoch": 4.242811501597444, + "grad_norm": 0.13481132686138153, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5312 + }, + { + "epoch": 4.243610223642173, + "grad_norm": 0.16327355802059174, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5313 + }, + { + "epoch": 4.244408945686901, + "grad_norm": 0.0873674675822258, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5314 + }, + { + "epoch": 4.24520766773163, + "grad_norm": 0.16612505912780762, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5315 + }, + { + "epoch": 4.246006389776358, + "grad_norm": 0.15376444160938263, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5316 + }, + { + "epoch": 4.246805111821086, + "grad_norm": 0.07853512465953827, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5317 + }, + { + "epoch": 4.247603833865814, + "grad_norm": 0.11799992620944977, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5318 + }, + { + "epoch": 4.248402555910543, + "grad_norm": 0.09121575206518173, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 5319 + }, + { + "epoch": 4.2492012779552715, + "grad_norm": 0.09780153632164001, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5320 + }, + { + "epoch": 4.25, + "grad_norm": 0.11387690156698227, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5321 + }, + { + "epoch": 4.2507987220447285, + "grad_norm": 0.08085697889328003, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5322 + }, + { + "epoch": 4.251597444089457, + "grad_norm": 0.09986089169979095, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5323 + }, + { + "epoch": 4.252396166134186, + "grad_norm": 0.07728606462478638, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5324 + }, + { + "epoch": 4.253194888178914, + "grad_norm": 0.07464555650949478, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5325 + }, + { + "epoch": 4.253993610223642, + "grad_norm": 0.05129759758710861, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5326 + }, + { + "epoch": 4.25479233226837, + "grad_norm": 0.060275599360466, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5327 + }, + { + "epoch": 4.255591054313099, + "grad_norm": 0.07773016393184662, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5328 + }, + { + "epoch": 4.256389776357827, + "grad_norm": 0.1046462282538414, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5329 + }, + { + "epoch": 4.257188498402556, + "grad_norm": 0.1184321865439415, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5330 + }, + { + "epoch": 4.257987220447284, + "grad_norm": 0.1419631987810135, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5331 + }, + { + "epoch": 4.258785942492013, + "grad_norm": 0.10022144019603729, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5332 + }, + { + "epoch": 4.2595846645367414, + "grad_norm": 0.075701504945755, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5333 + }, + { + "epoch": 4.26038338658147, + "grad_norm": 0.18145573139190674, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5334 + }, + { + "epoch": 4.261182108626198, + "grad_norm": 0.06092703342437744, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5335 + }, + { + "epoch": 4.261980830670926, + "grad_norm": 0.13196219503879547, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5336 + }, + { + "epoch": 4.262779552715655, + "grad_norm": 0.17139793932437897, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5337 + }, + { + "epoch": 4.263578274760383, + "grad_norm": 0.12072623521089554, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5338 + }, + { + "epoch": 4.264376996805112, + "grad_norm": 0.11874449253082275, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5339 + }, + { + "epoch": 4.26517571884984, + "grad_norm": 0.10718921571969986, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5340 + }, + { + "epoch": 4.265974440894569, + "grad_norm": 0.07337968051433563, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5341 + }, + { + "epoch": 4.266773162939297, + "grad_norm": 0.11872536689043045, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5342 + }, + { + "epoch": 4.267571884984026, + "grad_norm": 0.11199923604726791, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5343 + }, + { + "epoch": 4.268370607028754, + "grad_norm": 0.05864759162068367, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5344 + }, + { + "epoch": 4.269169329073483, + "grad_norm": 0.14757969975471497, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5345 + }, + { + "epoch": 4.2699680511182105, + "grad_norm": 0.12190169841051102, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5346 + }, + { + "epoch": 4.270766773162939, + "grad_norm": 0.0532461479306221, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5347 + }, + { + "epoch": 4.271565495207668, + "grad_norm": 0.10723208636045456, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5348 + }, + { + "epoch": 4.272364217252396, + "grad_norm": 0.07115229964256287, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5349 + }, + { + "epoch": 4.273162939297125, + "grad_norm": 0.07450878620147705, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 5350 + }, + { + "epoch": 4.273961661341853, + "grad_norm": 0.11793115735054016, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5351 + }, + { + "epoch": 4.274760383386582, + "grad_norm": 0.10440219938755035, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5352 + }, + { + "epoch": 4.27555910543131, + "grad_norm": 0.27991926670074463, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5353 + }, + { + "epoch": 4.276357827476039, + "grad_norm": 0.11090446263551712, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5354 + }, + { + "epoch": 4.277156549520766, + "grad_norm": 0.10509627312421799, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5355 + }, + { + "epoch": 4.277955271565495, + "grad_norm": 0.06217970326542854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5356 + }, + { + "epoch": 4.2787539936102235, + "grad_norm": 0.34369224309921265, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5357 + }, + { + "epoch": 4.279552715654952, + "grad_norm": 0.1246214285492897, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5358 + }, + { + "epoch": 4.2803514376996805, + "grad_norm": 0.06331677734851837, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5359 + }, + { + "epoch": 4.281150159744409, + "grad_norm": 0.08274740725755692, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5360 + }, + { + "epoch": 4.281948881789138, + "grad_norm": 0.06133527308702469, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5361 + }, + { + "epoch": 4.282747603833866, + "grad_norm": 0.09867174178361893, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5362 + }, + { + "epoch": 4.283546325878595, + "grad_norm": 0.09370579570531845, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5363 + }, + { + "epoch": 4.284345047923322, + "grad_norm": 0.2549540400505066, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5364 + }, + { + "epoch": 4.285143769968051, + "grad_norm": 0.1900271773338318, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5365 + }, + { + "epoch": 4.285942492012779, + "grad_norm": 0.21450525522232056, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5366 + }, + { + "epoch": 4.286741214057508, + "grad_norm": 0.1381012350320816, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5367 + }, + { + "epoch": 4.287539936102236, + "grad_norm": 0.0813983827829361, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5368 + }, + { + "epoch": 4.288338658146965, + "grad_norm": 0.16513130068778992, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5369 + }, + { + "epoch": 4.289137380191693, + "grad_norm": 0.10825667530298233, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5370 + }, + { + "epoch": 4.289936102236422, + "grad_norm": 0.07226242125034332, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5371 + }, + { + "epoch": 4.2907348242811505, + "grad_norm": 0.1278400719165802, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5372 + }, + { + "epoch": 4.291533546325878, + "grad_norm": 0.11092592030763626, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5373 + }, + { + "epoch": 4.292332268370607, + "grad_norm": 0.08732229471206665, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5374 + }, + { + "epoch": 4.293130990415335, + "grad_norm": 0.2182341367006302, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5375 + }, + { + "epoch": 4.293929712460064, + "grad_norm": 0.10107403993606567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5376 + }, + { + "epoch": 4.294728434504792, + "grad_norm": 0.13586364686489105, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5377 + }, + { + "epoch": 4.295527156549521, + "grad_norm": 0.3685734272003174, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5378 + }, + { + "epoch": 4.296325878594249, + "grad_norm": 0.13060712814331055, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5379 + }, + { + "epoch": 4.297124600638978, + "grad_norm": 0.05988436937332153, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5380 + }, + { + "epoch": 4.297923322683706, + "grad_norm": 0.14392045140266418, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 5381 + }, + { + "epoch": 4.298722044728435, + "grad_norm": 0.25003254413604736, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5382 + }, + { + "epoch": 4.2995207667731625, + "grad_norm": 0.055451687425374985, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5383 + }, + { + "epoch": 4.300319488817891, + "grad_norm": 0.11186914891004562, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5384 + }, + { + "epoch": 4.30111821086262, + "grad_norm": 0.11314704269170761, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5385 + }, + { + "epoch": 4.301916932907348, + "grad_norm": 0.43445560336112976, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5386 + }, + { + "epoch": 4.302715654952077, + "grad_norm": 0.09362242370843887, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5387 + }, + { + "epoch": 4.303514376996805, + "grad_norm": 0.04405852034687996, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5388 + }, + { + "epoch": 4.304313099041534, + "grad_norm": 0.12615318596363068, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5389 + }, + { + "epoch": 4.305111821086262, + "grad_norm": 0.1067153736948967, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5390 + }, + { + "epoch": 4.305910543130991, + "grad_norm": 0.05732683837413788, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5391 + }, + { + "epoch": 4.306709265175719, + "grad_norm": 0.2452571988105774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5392 + }, + { + "epoch": 4.307507987220447, + "grad_norm": 0.11733133345842361, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5393 + }, + { + "epoch": 4.3083067092651754, + "grad_norm": 0.06771894544363022, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5394 + }, + { + "epoch": 4.309105431309904, + "grad_norm": 0.12928563356399536, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5395 + }, + { + "epoch": 4.3099041533546325, + "grad_norm": 0.1777956187725067, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5396 + }, + { + "epoch": 4.310702875399361, + "grad_norm": 0.1281544715166092, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5397 + }, + { + "epoch": 4.31150159744409, + "grad_norm": 0.07120000571012497, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5398 + }, + { + "epoch": 4.312300319488818, + "grad_norm": 0.1270848512649536, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5399 + }, + { + "epoch": 4.313099041533547, + "grad_norm": 0.17685648798942566, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5400 + }, + { + "epoch": 4.313897763578275, + "grad_norm": 0.05070900544524193, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5401 + }, + { + "epoch": 4.314696485623003, + "grad_norm": 0.10543418675661087, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5402 + }, + { + "epoch": 4.315495207667731, + "grad_norm": 0.12336398661136627, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5403 + }, + { + "epoch": 4.31629392971246, + "grad_norm": 0.1583624631166458, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5404 + }, + { + "epoch": 4.317092651757188, + "grad_norm": 0.08186022192239761, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5405 + }, + { + "epoch": 4.317891373801917, + "grad_norm": 0.07562705129384995, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5406 + }, + { + "epoch": 4.318690095846645, + "grad_norm": 0.05275554209947586, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5407 + }, + { + "epoch": 4.319488817891374, + "grad_norm": 0.06432928144931793, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5408 + }, + { + "epoch": 4.3202875399361025, + "grad_norm": 0.08220377564430237, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5409 + }, + { + "epoch": 4.321086261980831, + "grad_norm": 0.07882758229970932, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5410 + }, + { + "epoch": 4.321884984025559, + "grad_norm": 0.138245090842247, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5411 + }, + { + "epoch": 4.322683706070287, + "grad_norm": 0.1127534806728363, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5412 + }, + { + "epoch": 4.323482428115016, + "grad_norm": 0.1985669732093811, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5413 + }, + { + "epoch": 4.324281150159744, + "grad_norm": 0.08023711293935776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5414 + }, + { + "epoch": 4.325079872204473, + "grad_norm": 0.13853015005588531, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5415 + }, + { + "epoch": 4.325878594249201, + "grad_norm": 0.18319782614707947, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5416 + }, + { + "epoch": 4.32667731629393, + "grad_norm": 0.073015958070755, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5417 + }, + { + "epoch": 4.327476038338658, + "grad_norm": 0.10771846771240234, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5418 + }, + { + "epoch": 4.328274760383387, + "grad_norm": 0.09512028843164444, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5419 + }, + { + "epoch": 4.329073482428115, + "grad_norm": 0.0822201818227768, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5420 + }, + { + "epoch": 4.329872204472843, + "grad_norm": 0.11839213222265244, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5421 + }, + { + "epoch": 4.330670926517572, + "grad_norm": 0.10274796187877655, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5422 + }, + { + "epoch": 4.3314696485623, + "grad_norm": 0.05896717682480812, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5423 + }, + { + "epoch": 4.332268370607029, + "grad_norm": 0.1268780380487442, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5424 + }, + { + "epoch": 4.333067092651757, + "grad_norm": 0.09173188358545303, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5425 + }, + { + "epoch": 4.333865814696486, + "grad_norm": 0.05155360326170921, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5426 + }, + { + "epoch": 4.334664536741214, + "grad_norm": 0.08836793899536133, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 4.335463258785943, + "grad_norm": 0.08620470017194748, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5428 + }, + { + "epoch": 4.336261980830671, + "grad_norm": 0.06972123682498932, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5429 + }, + { + "epoch": 4.3370607028754, + "grad_norm": 0.12461638450622559, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5430 + }, + { + "epoch": 4.337859424920127, + "grad_norm": 0.08546463400125504, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5431 + }, + { + "epoch": 4.338658146964856, + "grad_norm": 0.08495177328586578, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5432 + }, + { + "epoch": 4.3394568690095845, + "grad_norm": 0.13017377257347107, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5433 + }, + { + "epoch": 4.340255591054313, + "grad_norm": 0.13619504868984222, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5434 + }, + { + "epoch": 4.3410543130990416, + "grad_norm": 0.5835675597190857, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5435 + }, + { + "epoch": 4.34185303514377, + "grad_norm": 0.09355206042528152, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5436 + }, + { + "epoch": 4.342651757188499, + "grad_norm": 0.08626751601696014, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5437 + }, + { + "epoch": 4.343450479233227, + "grad_norm": 0.05652647092938423, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5438 + }, + { + "epoch": 4.344249201277956, + "grad_norm": 0.05232316255569458, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5439 + }, + { + "epoch": 4.345047923322683, + "grad_norm": 0.08115233480930328, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5440 + }, + { + "epoch": 4.345846645367412, + "grad_norm": 0.08757120370864868, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5441 + }, + { + "epoch": 4.34664536741214, + "grad_norm": 0.046224139630794525, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5442 + }, + { + "epoch": 4.347444089456869, + "grad_norm": 0.07967934757471085, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5443 + }, + { + "epoch": 4.348242811501597, + "grad_norm": 0.044298652559518814, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5444 + }, + { + "epoch": 4.349041533546326, + "grad_norm": 0.09021158516407013, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5445 + }, + { + "epoch": 4.3498402555910545, + "grad_norm": 0.12857890129089355, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5446 + }, + { + "epoch": 4.350638977635783, + "grad_norm": 0.05655589699745178, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5447 + }, + { + "epoch": 4.3514376996805115, + "grad_norm": 0.09304624050855637, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5448 + }, + { + "epoch": 4.352236421725239, + "grad_norm": 0.19815632700920105, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5449 + }, + { + "epoch": 4.353035143769968, + "grad_norm": 0.0526299886405468, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5450 + }, + { + "epoch": 4.353833865814696, + "grad_norm": 0.06432242691516876, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5451 + }, + { + "epoch": 4.354632587859425, + "grad_norm": 0.07848794758319855, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5452 + }, + { + "epoch": 4.355431309904153, + "grad_norm": 0.08260536193847656, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5453 + }, + { + "epoch": 4.356230031948882, + "grad_norm": 0.052810169756412506, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5454 + }, + { + "epoch": 4.35702875399361, + "grad_norm": 0.06942226737737656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5455 + }, + { + "epoch": 4.357827476038339, + "grad_norm": 0.13892871141433716, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5456 + }, + { + "epoch": 4.358626198083067, + "grad_norm": 0.15982909500598907, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5457 + }, + { + "epoch": 4.359424920127796, + "grad_norm": 0.08206653594970703, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5458 + }, + { + "epoch": 4.360223642172524, + "grad_norm": 0.08957790583372116, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5459 + }, + { + "epoch": 4.361022364217252, + "grad_norm": 0.03882770985364914, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5460 + }, + { + "epoch": 4.361821086261981, + "grad_norm": 0.0928555279970169, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5461 + }, + { + "epoch": 4.362619808306709, + "grad_norm": 0.057321447879076004, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5462 + }, + { + "epoch": 4.363418530351438, + "grad_norm": 0.0737103596329689, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5463 + }, + { + "epoch": 4.364217252396166, + "grad_norm": 0.06696293503046036, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5464 + }, + { + "epoch": 4.365015974440895, + "grad_norm": 0.04572489857673645, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5465 + }, + { + "epoch": 4.365814696485623, + "grad_norm": 0.094516322016716, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5466 + }, + { + "epoch": 4.366613418530352, + "grad_norm": 0.045576825737953186, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5467 + }, + { + "epoch": 4.36741214057508, + "grad_norm": 0.06839725375175476, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5468 + }, + { + "epoch": 4.368210862619808, + "grad_norm": 0.14465193450450897, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5469 + }, + { + "epoch": 4.3690095846645365, + "grad_norm": 0.07930073887109756, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5470 + }, + { + "epoch": 4.369808306709265, + "grad_norm": 0.06120619550347328, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5471 + }, + { + "epoch": 4.3706070287539935, + "grad_norm": 0.066256083548069, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5472 + }, + { + "epoch": 4.371405750798722, + "grad_norm": 0.11696353554725647, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5473 + }, + { + "epoch": 4.372204472843451, + "grad_norm": 0.11530395597219467, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5474 + }, + { + "epoch": 4.373003194888179, + "grad_norm": 0.05663579702377319, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5475 + }, + { + "epoch": 4.373801916932908, + "grad_norm": 0.1241946592926979, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5476 + }, + { + "epoch": 4.374600638977636, + "grad_norm": 0.1725323498249054, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5477 + }, + { + "epoch": 4.375399361022364, + "grad_norm": 0.09785371273756027, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5478 + }, + { + "epoch": 4.376198083067092, + "grad_norm": 0.0813792496919632, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5479 + }, + { + "epoch": 4.376996805111821, + "grad_norm": 0.17471592128276825, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5480 + }, + { + "epoch": 4.377795527156549, + "grad_norm": 0.1923220455646515, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5481 + }, + { + "epoch": 4.378594249201278, + "grad_norm": 0.09857932478189468, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5482 + }, + { + "epoch": 4.3793929712460065, + "grad_norm": 0.10073419660329819, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5483 + }, + { + "epoch": 4.380191693290735, + "grad_norm": 0.35731273889541626, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5484 + }, + { + "epoch": 4.3809904153354635, + "grad_norm": 0.12060656398534775, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5485 + }, + { + "epoch": 4.381789137380192, + "grad_norm": 0.10264381766319275, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5486 + }, + { + "epoch": 4.38258785942492, + "grad_norm": 0.0868317037820816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5487 + }, + { + "epoch": 4.383386581469648, + "grad_norm": 0.07722344994544983, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5488 + }, + { + "epoch": 4.384185303514377, + "grad_norm": 0.3690173327922821, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5489 + }, + { + "epoch": 4.384984025559105, + "grad_norm": 0.18400169909000397, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5490 + }, + { + "epoch": 4.385782747603834, + "grad_norm": 0.14671844244003296, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5491 + }, + { + "epoch": 4.386581469648562, + "grad_norm": 0.05277179554104805, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5492 + }, + { + "epoch": 4.387380191693291, + "grad_norm": 0.13593660295009613, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5493 + }, + { + "epoch": 4.388178913738019, + "grad_norm": 0.1318334937095642, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5494 + }, + { + "epoch": 4.388977635782748, + "grad_norm": 0.07189908623695374, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5495 + }, + { + "epoch": 4.389776357827476, + "grad_norm": 0.07969736307859421, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5496 + }, + { + "epoch": 4.390575079872204, + "grad_norm": 0.07449150085449219, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5497 + }, + { + "epoch": 4.391373801916933, + "grad_norm": 0.533295214176178, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5498 + }, + { + "epoch": 4.392172523961661, + "grad_norm": 0.10412111133337021, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5499 + }, + { + "epoch": 4.39297124600639, + "grad_norm": 0.08482066541910172, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5500 + }, + { + "epoch": 4.393769968051118, + "grad_norm": 0.08023949712514877, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5501 + }, + { + "epoch": 4.394568690095847, + "grad_norm": 0.16967490315437317, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5502 + }, + { + "epoch": 4.395367412140575, + "grad_norm": 0.1979716271162033, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5503 + }, + { + "epoch": 4.396166134185304, + "grad_norm": 0.09058263152837753, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5504 + }, + { + "epoch": 4.396964856230032, + "grad_norm": 0.13149574398994446, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5505 + }, + { + "epoch": 4.397763578274761, + "grad_norm": 0.08240146189928055, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5506 + }, + { + "epoch": 4.3985623003194885, + "grad_norm": 0.13789936900138855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5507 + }, + { + "epoch": 4.399361022364217, + "grad_norm": 0.18576087057590485, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5508 + }, + { + "epoch": 4.4001597444089455, + "grad_norm": 0.13780297338962555, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5509 + }, + { + "epoch": 4.400958466453674, + "grad_norm": 0.14724896848201752, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5510 + }, + { + "epoch": 4.401757188498403, + "grad_norm": 0.20418551564216614, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5511 + }, + { + "epoch": 4.402555910543131, + "grad_norm": 0.1841040551662445, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5512 + }, + { + "epoch": 4.40335463258786, + "grad_norm": 0.6994684338569641, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5513 + }, + { + "epoch": 4.404153354632588, + "grad_norm": 0.18882393836975098, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5514 + }, + { + "epoch": 4.404952076677317, + "grad_norm": 0.07170864939689636, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5515 + }, + { + "epoch": 4.405750798722044, + "grad_norm": 0.04765893518924713, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5516 + }, + { + "epoch": 4.406549520766773, + "grad_norm": 0.07294443249702454, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5517 + }, + { + "epoch": 4.407348242811501, + "grad_norm": 0.18566831946372986, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 5518 + }, + { + "epoch": 4.40814696485623, + "grad_norm": 0.10881441831588745, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5519 + }, + { + "epoch": 4.4089456869009584, + "grad_norm": 0.380438894033432, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5520 + }, + { + "epoch": 4.409744408945687, + "grad_norm": 0.19281962513923645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5521 + }, + { + "epoch": 4.4105431309904155, + "grad_norm": 0.05730361491441727, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5522 + }, + { + "epoch": 4.411341853035144, + "grad_norm": 0.09276643395423889, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5523 + }, + { + "epoch": 4.412140575079873, + "grad_norm": 0.070807084441185, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5524 + }, + { + "epoch": 4.4129392971246, + "grad_norm": 0.08902080357074738, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5525 + }, + { + "epoch": 4.413738019169329, + "grad_norm": 0.14861932396888733, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5526 + }, + { + "epoch": 4.414536741214057, + "grad_norm": 0.2678995728492737, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5527 + }, + { + "epoch": 4.415335463258786, + "grad_norm": 0.12902382016181946, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5528 + }, + { + "epoch": 4.416134185303514, + "grad_norm": 0.14999063313007355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5529 + }, + { + "epoch": 4.416932907348243, + "grad_norm": 0.13950730860233307, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5530 + }, + { + "epoch": 4.417731629392971, + "grad_norm": 0.12215374410152435, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5531 + }, + { + "epoch": 4.4185303514377, + "grad_norm": 0.12941284477710724, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5532 + }, + { + "epoch": 4.419329073482428, + "grad_norm": 0.22524291276931763, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5533 + }, + { + "epoch": 4.420127795527157, + "grad_norm": 0.0830528736114502, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5534 + }, + { + "epoch": 4.420926517571885, + "grad_norm": 0.1562981903553009, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5535 + }, + { + "epoch": 4.421725239616613, + "grad_norm": 0.19052654504776, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5536 + }, + { + "epoch": 4.422523961661342, + "grad_norm": 0.12264347821474075, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5537 + }, + { + "epoch": 4.42332268370607, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5538 + }, + { + "epoch": 4.424121405750799, + "grad_norm": 0.1412813812494278, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5539 + }, + { + "epoch": 4.424920127795527, + "grad_norm": 0.17808450758457184, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5540 + }, + { + "epoch": 4.425718849840256, + "grad_norm": 0.43806061148643494, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5541 + }, + { + "epoch": 4.426517571884984, + "grad_norm": 0.17728228867053986, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5542 + }, + { + "epoch": 4.427316293929713, + "grad_norm": 0.12434227764606476, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5543 + }, + { + "epoch": 4.428115015974441, + "grad_norm": 0.10051420331001282, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5544 + }, + { + "epoch": 4.428913738019169, + "grad_norm": 0.0943203940987587, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5545 + }, + { + "epoch": 4.4297124600638975, + "grad_norm": 0.08082996308803558, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5546 + }, + { + "epoch": 4.430511182108626, + "grad_norm": 0.13405202329158783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5547 + }, + { + "epoch": 4.431309904153355, + "grad_norm": 0.10448389500379562, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5548 + }, + { + "epoch": 4.432108626198083, + "grad_norm": 0.32405009865760803, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5549 + }, + { + "epoch": 4.432907348242812, + "grad_norm": 0.09690065681934357, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5550 + }, + { + "epoch": 4.43370607028754, + "grad_norm": 0.35410076379776, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5551 + }, + { + "epoch": 4.434504792332269, + "grad_norm": 0.17826306819915771, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5552 + }, + { + "epoch": 4.435303514376997, + "grad_norm": 0.2252579778432846, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5553 + }, + { + "epoch": 4.436102236421725, + "grad_norm": 0.09508918970823288, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5554 + }, + { + "epoch": 4.436900958466453, + "grad_norm": 0.16872358322143555, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5555 + }, + { + "epoch": 4.437699680511182, + "grad_norm": 0.24836355447769165, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5556 + }, + { + "epoch": 4.43849840255591, + "grad_norm": 0.20887835323810577, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5557 + }, + { + "epoch": 4.439297124600639, + "grad_norm": 0.10922685265541077, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5558 + }, + { + "epoch": 4.4400958466453675, + "grad_norm": 0.44561028480529785, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5559 + }, + { + "epoch": 4.440894568690096, + "grad_norm": 0.18160179257392883, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5560 + }, + { + "epoch": 4.4416932907348246, + "grad_norm": 0.06924877315759659, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5561 + }, + { + "epoch": 4.442492012779553, + "grad_norm": 0.15605933964252472, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5562 + }, + { + "epoch": 4.443290734824281, + "grad_norm": 0.10880772024393082, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5563 + }, + { + "epoch": 4.444089456869009, + "grad_norm": 0.1252668797969818, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5564 + }, + { + "epoch": 4.444888178913738, + "grad_norm": 0.20452634990215302, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5565 + }, + { + "epoch": 4.445686900958466, + "grad_norm": 0.20973001420497894, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5566 + }, + { + "epoch": 4.446485623003195, + "grad_norm": 0.07631060481071472, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5567 + }, + { + "epoch": 4.447284345047923, + "grad_norm": 0.14793622493743896, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5568 + }, + { + "epoch": 4.448083067092652, + "grad_norm": 0.30125850439071655, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5569 + }, + { + "epoch": 4.44888178913738, + "grad_norm": 0.1291274130344391, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5570 + }, + { + "epoch": 4.449680511182109, + "grad_norm": 0.08679793030023575, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5571 + }, + { + "epoch": 4.4504792332268375, + "grad_norm": 0.11555953323841095, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5572 + }, + { + "epoch": 4.451277955271565, + "grad_norm": 0.10711846500635147, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5573 + }, + { + "epoch": 4.452076677316294, + "grad_norm": 0.0604897104203701, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5574 + }, + { + "epoch": 4.452875399361022, + "grad_norm": 0.08729933202266693, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5575 + }, + { + "epoch": 4.453674121405751, + "grad_norm": 0.09586715698242188, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5576 + }, + { + "epoch": 4.454472843450479, + "grad_norm": 0.11635993421077728, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5577 + }, + { + "epoch": 4.455271565495208, + "grad_norm": 0.12405801564455032, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5578 + }, + { + "epoch": 4.456070287539936, + "grad_norm": 0.1284986287355423, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5579 + }, + { + "epoch": 4.456869009584665, + "grad_norm": 0.09059973061084747, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5580 + }, + { + "epoch": 4.457667731629393, + "grad_norm": 0.08497101068496704, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5581 + }, + { + "epoch": 4.458466453674122, + "grad_norm": 0.10315481573343277, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5582 + }, + { + "epoch": 4.4592651757188495, + "grad_norm": 0.09923984855413437, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5583 + }, + { + "epoch": 4.460063897763578, + "grad_norm": 0.09179794788360596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5584 + }, + { + "epoch": 4.460862619808307, + "grad_norm": 0.0783005952835083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5585 + }, + { + "epoch": 4.461661341853035, + "grad_norm": 0.4005993604660034, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5586 + }, + { + "epoch": 4.462460063897764, + "grad_norm": 0.09382215887308121, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5587 + }, + { + "epoch": 4.463258785942492, + "grad_norm": 0.10208452492952347, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5588 + }, + { + "epoch": 4.464057507987221, + "grad_norm": 0.08237040042877197, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5589 + }, + { + "epoch": 4.464856230031949, + "grad_norm": 0.07287969440221786, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 5590 + }, + { + "epoch": 4.465654952076678, + "grad_norm": 0.07156763970851898, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5591 + }, + { + "epoch": 4.466453674121405, + "grad_norm": 0.11347219347953796, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5592 + }, + { + "epoch": 4.467252396166134, + "grad_norm": 0.13722039759159088, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5593 + }, + { + "epoch": 4.468051118210862, + "grad_norm": 0.20186153054237366, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5594 + }, + { + "epoch": 4.468849840255591, + "grad_norm": 0.1548159420490265, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5595 + }, + { + "epoch": 4.4696485623003195, + "grad_norm": 0.08960088342428207, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5596 + }, + { + "epoch": 4.470447284345048, + "grad_norm": 0.23552097380161285, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5597 + }, + { + "epoch": 4.4712460063897765, + "grad_norm": 0.34478914737701416, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5598 + }, + { + "epoch": 4.472044728434505, + "grad_norm": 0.219953253865242, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5599 + }, + { + "epoch": 4.472843450479234, + "grad_norm": 0.13104191422462463, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5600 + }, + { + "epoch": 4.473642172523961, + "grad_norm": 0.2867056131362915, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5601 + }, + { + "epoch": 4.47444089456869, + "grad_norm": 0.15794725716114044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5602 + }, + { + "epoch": 4.475239616613418, + "grad_norm": 0.10884165018796921, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5603 + }, + { + "epoch": 4.476038338658147, + "grad_norm": 1.0521267652511597, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5604 + }, + { + "epoch": 4.476837060702875, + "grad_norm": 0.07823536545038223, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5605 + }, + { + "epoch": 4.477635782747604, + "grad_norm": 0.1536101996898651, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5606 + }, + { + "epoch": 4.478434504792332, + "grad_norm": 0.1379251778125763, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5607 + }, + { + "epoch": 4.479233226837061, + "grad_norm": 0.06181122735142708, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5608 + }, + { + "epoch": 4.4800319488817895, + "grad_norm": 0.1701904535293579, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5609 + }, + { + "epoch": 4.480830670926517, + "grad_norm": 0.1322227120399475, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5610 + }, + { + "epoch": 4.481629392971246, + "grad_norm": 0.09158491343259811, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5611 + }, + { + "epoch": 4.482428115015974, + "grad_norm": 0.09851136803627014, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5612 + }, + { + "epoch": 4.483226837060703, + "grad_norm": 0.09350419789552689, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5613 + }, + { + "epoch": 4.484025559105431, + "grad_norm": 0.40614885091781616, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5614 + }, + { + "epoch": 4.48482428115016, + "grad_norm": 0.1653166264295578, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5615 + }, + { + "epoch": 4.485623003194888, + "grad_norm": 0.13429352641105652, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5616 + }, + { + "epoch": 4.486421725239617, + "grad_norm": 0.09340473264455795, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5617 + }, + { + "epoch": 4.487220447284345, + "grad_norm": 0.1621188223361969, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5618 + }, + { + "epoch": 4.488019169329074, + "grad_norm": 0.18538816273212433, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5619 + }, + { + "epoch": 4.488817891373802, + "grad_norm": 0.26981350779533386, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5620 + }, + { + "epoch": 4.48961661341853, + "grad_norm": 0.28865110874176025, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5621 + }, + { + "epoch": 4.4904153354632586, + "grad_norm": 0.23013874888420105, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5622 + }, + { + "epoch": 4.491214057507987, + "grad_norm": 0.08305853605270386, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5623 + }, + { + "epoch": 4.492012779552716, + "grad_norm": 0.1810445487499237, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5624 + }, + { + "epoch": 4.492811501597444, + "grad_norm": 0.23000332713127136, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5625 + }, + { + "epoch": 4.493610223642173, + "grad_norm": 0.06753652542829514, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5626 + }, + { + "epoch": 4.494408945686901, + "grad_norm": 0.19956068694591522, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5627 + }, + { + "epoch": 4.49520766773163, + "grad_norm": 0.24572248756885529, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5628 + }, + { + "epoch": 4.496006389776358, + "grad_norm": 0.06617605686187744, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5629 + }, + { + "epoch": 4.496805111821086, + "grad_norm": 0.18551495671272278, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.497603833865814, + "grad_norm": 0.16827648878097534, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5631 + }, + { + "epoch": 4.498402555910543, + "grad_norm": 0.13273993134498596, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5632 + }, + { + "epoch": 4.4992012779552715, + "grad_norm": 0.24461479485034943, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5633 + }, + { + "epoch": 4.5, + "grad_norm": 0.2016836553812027, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5634 + }, + { + "epoch": 4.5007987220447285, + "grad_norm": 0.07513006776571274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5635 + }, + { + "epoch": 4.501597444089457, + "grad_norm": 0.1701919138431549, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5636 + }, + { + "epoch": 4.502396166134186, + "grad_norm": 0.12785466015338898, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5637 + }, + { + "epoch": 4.503194888178914, + "grad_norm": 0.1135641485452652, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5638 + }, + { + "epoch": 4.503993610223642, + "grad_norm": 0.5004979372024536, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5639 + }, + { + "epoch": 4.50479233226837, + "grad_norm": 0.28730812668800354, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5640 + }, + { + "epoch": 4.505591054313099, + "grad_norm": 0.3666481673717499, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5641 + }, + { + "epoch": 4.506389776357827, + "grad_norm": 0.257710337638855, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5642 + }, + { + "epoch": 4.507188498402556, + "grad_norm": 0.20071941614151, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5643 + }, + { + "epoch": 4.507987220447284, + "grad_norm": 0.3445729613304138, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5644 + }, + { + "epoch": 4.508785942492013, + "grad_norm": 0.20297282934188843, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5645 + }, + { + "epoch": 4.5095846645367414, + "grad_norm": 0.1889636069536209, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5646 + }, + { + "epoch": 4.51038338658147, + "grad_norm": 0.2153794765472412, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5647 + }, + { + "epoch": 4.511182108626198, + "grad_norm": 0.15353621542453766, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5648 + }, + { + "epoch": 4.511980830670926, + "grad_norm": 0.1575399786233902, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5649 + }, + { + "epoch": 4.512779552715655, + "grad_norm": 0.5555608868598938, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5650 + }, + { + "epoch": 4.513578274760383, + "grad_norm": 0.26887524127960205, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5651 + }, + { + "epoch": 4.514376996805112, + "grad_norm": 0.11516866087913513, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5652 + }, + { + "epoch": 4.51517571884984, + "grad_norm": 0.19820965826511383, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5653 + }, + { + "epoch": 4.515974440894569, + "grad_norm": 0.2122081071138382, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5654 + }, + { + "epoch": 4.516773162939297, + "grad_norm": 0.10736703872680664, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5655 + }, + { + "epoch": 4.517571884984026, + "grad_norm": 0.09852312505245209, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5656 + }, + { + "epoch": 4.518370607028754, + "grad_norm": 0.07539162784814835, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5657 + }, + { + "epoch": 4.519169329073483, + "grad_norm": 0.07467353343963623, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5658 + }, + { + "epoch": 4.5199680511182105, + "grad_norm": 0.09987884759902954, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5659 + }, + { + "epoch": 4.520766773162939, + "grad_norm": 0.08720221370458603, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5660 + }, + { + "epoch": 4.521565495207668, + "grad_norm": 0.07798969000577927, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5661 + }, + { + "epoch": 4.522364217252396, + "grad_norm": 0.12410122901201248, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5662 + }, + { + "epoch": 4.523162939297125, + "grad_norm": 0.07746852934360504, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5663 + }, + { + "epoch": 4.523961661341853, + "grad_norm": 0.09171058982610703, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5664 + }, + { + "epoch": 4.524760383386582, + "grad_norm": 0.8176944255828857, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5665 + }, + { + "epoch": 4.52555910543131, + "grad_norm": 0.4282614290714264, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5666 + }, + { + "epoch": 4.526357827476039, + "grad_norm": 0.35193827748298645, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5667 + }, + { + "epoch": 4.527156549520766, + "grad_norm": 0.15641339123249054, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5668 + }, + { + "epoch": 4.527955271565495, + "grad_norm": 0.31442952156066895, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5669 + }, + { + "epoch": 4.5287539936102235, + "grad_norm": 0.3205500841140747, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5670 + }, + { + "epoch": 4.529552715654952, + "grad_norm": 0.2866390645503998, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5671 + }, + { + "epoch": 4.5303514376996805, + "grad_norm": 0.21028868854045868, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5672 + }, + { + "epoch": 4.531150159744409, + "grad_norm": 0.32687097787857056, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5673 + }, + { + "epoch": 4.531948881789138, + "grad_norm": 0.25662627816200256, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5674 + }, + { + "epoch": 4.532747603833866, + "grad_norm": 0.10192561894655228, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5675 + }, + { + "epoch": 4.533546325878595, + "grad_norm": 0.8102573752403259, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5676 + }, + { + "epoch": 4.534345047923322, + "grad_norm": 0.19127781689167023, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5677 + }, + { + "epoch": 4.535143769968051, + "grad_norm": 0.22435548901557922, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5678 + }, + { + "epoch": 4.535942492012779, + "grad_norm": 0.3271692395210266, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5679 + }, + { + "epoch": 4.536741214057508, + "grad_norm": 0.17226184904575348, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5680 + }, + { + "epoch": 4.537539936102236, + "grad_norm": 0.16628077626228333, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5681 + }, + { + "epoch": 4.538338658146965, + "grad_norm": 0.6196639537811279, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5682 + }, + { + "epoch": 4.539137380191693, + "grad_norm": 0.21590936183929443, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5683 + }, + { + "epoch": 4.539936102236422, + "grad_norm": 0.16313950717449188, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5684 + }, + { + "epoch": 4.5407348242811505, + "grad_norm": 0.12859022617340088, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5685 + }, + { + "epoch": 4.541533546325878, + "grad_norm": 0.1189458817243576, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5686 + }, + { + "epoch": 4.542332268370607, + "grad_norm": 6.769774913787842, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5687 + }, + { + "epoch": 4.543130990415335, + "grad_norm": 0.20253166556358337, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5688 + }, + { + "epoch": 4.543929712460064, + "grad_norm": 0.11631135642528534, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5689 + }, + { + "epoch": 4.544728434504792, + "grad_norm": 0.1848360300064087, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5690 + }, + { + "epoch": 4.545527156549521, + "grad_norm": 0.17804184556007385, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 5691 + }, + { + "epoch": 4.546325878594249, + "grad_norm": 0.2214183509349823, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5692 + }, + { + "epoch": 4.547124600638978, + "grad_norm": 16.448396682739258, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5693 + }, + { + "epoch": 4.547923322683706, + "grad_norm": 0.4933917224407196, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 5694 + }, + { + "epoch": 4.548722044728435, + "grad_norm": 0.41254448890686035, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 5695 + }, + { + "epoch": 4.549520766773163, + "grad_norm": 0.28898510336875916, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 5696 + }, + { + "epoch": 4.550319488817891, + "grad_norm": 0.2938457727432251, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5697 + }, + { + "epoch": 4.55111821086262, + "grad_norm": 0.2264672964811325, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5698 + }, + { + "epoch": 4.551916932907348, + "grad_norm": 0.12931588292121887, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5699 + }, + { + "epoch": 4.552715654952077, + "grad_norm": 0.22106601297855377, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5700 + }, + { + "epoch": 4.553514376996805, + "grad_norm": 0.31875962018966675, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5701 + }, + { + "epoch": 4.554313099041534, + "grad_norm": 0.3129211962223053, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5702 + }, + { + "epoch": 4.555111821086262, + "grad_norm": 0.1613578200340271, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5703 + }, + { + "epoch": 4.555910543130991, + "grad_norm": 0.6340786814689636, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 5704 + }, + { + "epoch": 4.556709265175719, + "grad_norm": 0.13203595578670502, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5705 + }, + { + "epoch": 4.557507987220447, + "grad_norm": 0.16561077535152435, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5706 + }, + { + "epoch": 4.5583067092651754, + "grad_norm": 0.17777414619922638, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 5707 + }, + { + "epoch": 4.559105431309904, + "grad_norm": 0.6985258460044861, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 5708 + }, + { + "epoch": 4.5599041533546325, + "grad_norm": 0.18673790991306305, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5709 + }, + { + "epoch": 4.560702875399361, + "grad_norm": 0.10636870563030243, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5710 + }, + { + "epoch": 4.56150159744409, + "grad_norm": 0.1719052493572235, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5711 + }, + { + "epoch": 4.562300319488818, + "grad_norm": 0.7030455470085144, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5712 + }, + { + "epoch": 4.563099041533547, + "grad_norm": 0.1482628434896469, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 5713 + }, + { + "epoch": 4.563897763578275, + "grad_norm": 0.1585852950811386, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5714 + }, + { + "epoch": 4.564696485623003, + "grad_norm": 0.16067056357860565, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5715 + }, + { + "epoch": 4.565495207667731, + "grad_norm": 0.16162389516830444, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5716 + }, + { + "epoch": 4.56629392971246, + "grad_norm": 0.07224202156066895, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5717 + }, + { + "epoch": 4.567092651757188, + "grad_norm": 0.2577751576900482, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5718 + }, + { + "epoch": 4.567891373801917, + "grad_norm": 1.676942229270935, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5719 + }, + { + "epoch": 4.568690095846645, + "grad_norm": 0.11058419197797775, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5720 + }, + { + "epoch": 4.569488817891374, + "grad_norm": 0.23155376315116882, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5721 + }, + { + "epoch": 4.5702875399361025, + "grad_norm": 0.1197747215628624, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5722 + }, + { + "epoch": 4.571086261980831, + "grad_norm": 0.5179840326309204, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5723 + }, + { + "epoch": 4.571884984025559, + "grad_norm": 0.17717961966991425, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5724 + }, + { + "epoch": 4.572683706070287, + "grad_norm": 0.1513422429561615, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5725 + }, + { + "epoch": 4.573482428115016, + "grad_norm": 0.15495018661022186, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5726 + }, + { + "epoch": 4.574281150159744, + "grad_norm": 3.4248743057250977, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5727 + }, + { + "epoch": 4.575079872204473, + "grad_norm": 0.29529228806495667, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5728 + }, + { + "epoch": 4.575878594249201, + "grad_norm": 0.21125876903533936, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 5729 + }, + { + "epoch": 4.57667731629393, + "grad_norm": 0.16381484270095825, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5730 + }, + { + "epoch": 4.577476038338658, + "grad_norm": 0.2144167572259903, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 5731 + }, + { + "epoch": 4.578274760383387, + "grad_norm": 0.1564428210258484, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5732 + }, + { + "epoch": 4.5790734824281145, + "grad_norm": 0.21137529611587524, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5733 + }, + { + "epoch": 4.579872204472844, + "grad_norm": 0.13836248219013214, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5734 + }, + { + "epoch": 4.580670926517572, + "grad_norm": 0.11749537289142609, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5735 + }, + { + "epoch": 4.5814696485623, + "grad_norm": 0.10901704430580139, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5736 + }, + { + "epoch": 4.582268370607029, + "grad_norm": 0.08402425795793533, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5737 + }, + { + "epoch": 4.583067092651757, + "grad_norm": 0.1502164900302887, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5738 + }, + { + "epoch": 4.583865814696486, + "grad_norm": 0.10606876760721207, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5739 + }, + { + "epoch": 4.584664536741214, + "grad_norm": 0.11868279427289963, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5740 + }, + { + "epoch": 4.585463258785943, + "grad_norm": 0.10678767412900925, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5741 + }, + { + "epoch": 4.586261980830671, + "grad_norm": 0.28886285424232483, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5742 + }, + { + "epoch": 4.5870607028754, + "grad_norm": 0.3516097366809845, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5743 + }, + { + "epoch": 4.587859424920127, + "grad_norm": 0.10221854597330093, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5744 + }, + { + "epoch": 4.588658146964856, + "grad_norm": 0.24786177277565002, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5745 + }, + { + "epoch": 4.5894568690095845, + "grad_norm": 0.10537181794643402, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5746 + }, + { + "epoch": 4.590255591054313, + "grad_norm": 0.23574885725975037, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5747 + }, + { + "epoch": 4.5910543130990416, + "grad_norm": 0.1483563631772995, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5748 + }, + { + "epoch": 4.59185303514377, + "grad_norm": 0.1516815721988678, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 5749 + }, + { + "epoch": 4.592651757188499, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5750 + }, + { + "epoch": 4.593450479233227, + "grad_norm": 0.10706239938735962, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5751 + }, + { + "epoch": 4.594249201277956, + "grad_norm": 1.081868290901184, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5752 + }, + { + "epoch": 4.595047923322683, + "grad_norm": 0.4016919732093811, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5753 + }, + { + "epoch": 4.595846645367412, + "grad_norm": 0.3266371786594391, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5754 + }, + { + "epoch": 4.59664536741214, + "grad_norm": 0.23380769789218903, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5755 + }, + { + "epoch": 4.597444089456869, + "grad_norm": 0.2521349787712097, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 5756 + }, + { + "epoch": 4.598242811501597, + "grad_norm": 0.2223331481218338, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5757 + }, + { + "epoch": 4.599041533546326, + "grad_norm": 0.177442729473114, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 5758 + }, + { + "epoch": 4.5998402555910545, + "grad_norm": 0.18474844098091125, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 5759 + }, + { + "epoch": 4.600638977635783, + "grad_norm": 0.1686495542526245, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5760 + }, + { + "epoch": 4.6014376996805115, + "grad_norm": 0.13674414157867432, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5761 + }, + { + "epoch": 4.602236421725239, + "grad_norm": 0.1390203833580017, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 5762 + }, + { + "epoch": 4.603035143769968, + "grad_norm": 0.10701096057891846, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5763 + }, + { + "epoch": 4.603833865814696, + "grad_norm": 0.110149085521698, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5764 + }, + { + "epoch": 4.604632587859425, + "grad_norm": 0.2477579116821289, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5765 + }, + { + "epoch": 4.605431309904153, + "grad_norm": 0.2554718852043152, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5766 + }, + { + "epoch": 4.606230031948882, + "grad_norm": 0.1945963203907013, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5767 + }, + { + "epoch": 4.60702875399361, + "grad_norm": 0.26785531640052795, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5768 + }, + { + "epoch": 4.607827476038339, + "grad_norm": 0.3007332980632782, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5769 + }, + { + "epoch": 4.608626198083067, + "grad_norm": 0.09973788261413574, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5770 + }, + { + "epoch": 4.609424920127795, + "grad_norm": 0.09176181256771088, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5771 + }, + { + "epoch": 4.6102236421725244, + "grad_norm": 0.1395607590675354, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5772 + }, + { + "epoch": 4.611022364217252, + "grad_norm": 0.8938566446304321, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5773 + }, + { + "epoch": 4.611821086261981, + "grad_norm": 0.3093889653682709, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5774 + }, + { + "epoch": 4.612619808306709, + "grad_norm": 0.1910911351442337, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5775 + }, + { + "epoch": 4.613418530351438, + "grad_norm": 0.11586496978998184, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5776 + }, + { + "epoch": 4.614217252396166, + "grad_norm": 0.222470223903656, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 5777 + }, + { + "epoch": 4.615015974440895, + "grad_norm": 0.16580955684185028, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5778 + }, + { + "epoch": 4.615814696485623, + "grad_norm": 0.11279458552598953, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5779 + }, + { + "epoch": 4.616613418530352, + "grad_norm": 0.10970400273799896, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5780 + }, + { + "epoch": 4.61741214057508, + "grad_norm": 0.11291752755641937, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5781 + }, + { + "epoch": 4.618210862619808, + "grad_norm": 0.19262762367725372, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5782 + }, + { + "epoch": 4.6190095846645365, + "grad_norm": 0.12736102938652039, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5783 + }, + { + "epoch": 4.619808306709265, + "grad_norm": 0.09300720691680908, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5784 + }, + { + "epoch": 4.6206070287539935, + "grad_norm": 0.09544654190540314, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5785 + }, + { + "epoch": 4.621405750798722, + "grad_norm": 0.2888239026069641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5786 + }, + { + "epoch": 4.622204472843451, + "grad_norm": 0.22988484799861908, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5787 + }, + { + "epoch": 4.623003194888179, + "grad_norm": 0.2574143707752228, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5788 + }, + { + "epoch": 4.623801916932908, + "grad_norm": 0.2503221333026886, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5789 + }, + { + "epoch": 4.624600638977636, + "grad_norm": 0.20846052467823029, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5790 + }, + { + "epoch": 4.625399361022364, + "grad_norm": 0.218403160572052, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5791 + }, + { + "epoch": 4.626198083067092, + "grad_norm": 0.11333920061588287, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5792 + }, + { + "epoch": 4.626996805111821, + "grad_norm": 0.19022895395755768, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5793 + }, + { + "epoch": 4.627795527156549, + "grad_norm": 0.1525644063949585, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 5794 + }, + { + "epoch": 4.628594249201278, + "grad_norm": 0.07636452466249466, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5795 + }, + { + "epoch": 4.6293929712460065, + "grad_norm": 0.1358552873134613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5796 + }, + { + "epoch": 4.630191693290735, + "grad_norm": 0.08993138372898102, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5797 + }, + { + "epoch": 4.6309904153354635, + "grad_norm": 0.15454545617103577, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5798 + }, + { + "epoch": 4.631789137380192, + "grad_norm": 0.12256992608308792, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5799 + }, + { + "epoch": 4.63258785942492, + "grad_norm": 0.08453187346458435, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5800 + }, + { + "epoch": 4.633386581469648, + "grad_norm": 0.1474936157464981, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5801 + }, + { + "epoch": 4.634185303514377, + "grad_norm": 0.11481066793203354, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5802 + }, + { + "epoch": 4.634984025559105, + "grad_norm": 0.41141587495803833, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5803 + }, + { + "epoch": 4.635782747603834, + "grad_norm": 0.1509549766778946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5804 + }, + { + "epoch": 4.636581469648562, + "grad_norm": 0.13562771677970886, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5805 + }, + { + "epoch": 4.637380191693291, + "grad_norm": 0.09722459316253662, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5806 + }, + { + "epoch": 4.638178913738019, + "grad_norm": 0.3194493353366852, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5807 + }, + { + "epoch": 4.638977635782748, + "grad_norm": 0.23091651499271393, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5808 + }, + { + "epoch": 4.6397763578274756, + "grad_norm": 0.1682155877351761, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5809 + }, + { + "epoch": 4.640575079872205, + "grad_norm": 0.37293288111686707, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5810 + }, + { + "epoch": 4.641373801916933, + "grad_norm": 0.3746488094329834, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5811 + }, + { + "epoch": 4.642172523961661, + "grad_norm": 0.2068052738904953, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5812 + }, + { + "epoch": 4.64297124600639, + "grad_norm": 0.13229581713676453, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5813 + }, + { + "epoch": 4.643769968051118, + "grad_norm": 0.24158459901809692, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5814 + }, + { + "epoch": 4.644568690095847, + "grad_norm": 0.4241867959499359, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5815 + }, + { + "epoch": 4.645367412140575, + "grad_norm": 0.40008923411369324, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5816 + }, + { + "epoch": 4.646166134185304, + "grad_norm": 0.3150584101676941, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 5817 + }, + { + "epoch": 4.646964856230032, + "grad_norm": 0.11021434515714645, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5818 + }, + { + "epoch": 4.647763578274761, + "grad_norm": 0.30061402916908264, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5819 + }, + { + "epoch": 4.6485623003194885, + "grad_norm": 0.12583592534065247, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5820 + }, + { + "epoch": 4.649361022364217, + "grad_norm": 0.31917983293533325, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5821 + }, + { + "epoch": 4.6501597444089455, + "grad_norm": 0.2097153663635254, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5822 + }, + { + "epoch": 4.650958466453674, + "grad_norm": 0.19847621023654938, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5823 + }, + { + "epoch": 4.651757188498403, + "grad_norm": 0.2482050508260727, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5824 + }, + { + "epoch": 4.652555910543131, + "grad_norm": 0.1257491409778595, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5825 + }, + { + "epoch": 4.65335463258786, + "grad_norm": 0.2192201465368271, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5826 + }, + { + "epoch": 4.654153354632588, + "grad_norm": 0.16453656554222107, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5827 + }, + { + "epoch": 4.654952076677317, + "grad_norm": 0.18813923001289368, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5828 + }, + { + "epoch": 4.655750798722044, + "grad_norm": 0.1811141073703766, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5829 + }, + { + "epoch": 4.656549520766773, + "grad_norm": 0.08911352604627609, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5830 + }, + { + "epoch": 4.657348242811501, + "grad_norm": 0.17858019471168518, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5831 + }, + { + "epoch": 4.65814696485623, + "grad_norm": 0.27315759658813477, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5832 + }, + { + "epoch": 4.6589456869009584, + "grad_norm": 0.18612337112426758, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5833 + }, + { + "epoch": 4.659744408945687, + "grad_norm": 0.2646125257015228, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5834 + }, + { + "epoch": 4.6605431309904155, + "grad_norm": 0.07320903241634369, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5835 + }, + { + "epoch": 4.661341853035144, + "grad_norm": 0.12969297170639038, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5836 + }, + { + "epoch": 4.662140575079873, + "grad_norm": 0.37665078043937683, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5837 + }, + { + "epoch": 4.6629392971246, + "grad_norm": 0.11055029928684235, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 5838 + }, + { + "epoch": 4.663738019169329, + "grad_norm": 0.12279482185840607, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5839 + }, + { + "epoch": 4.664536741214057, + "grad_norm": 0.0686316192150116, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5840 + }, + { + "epoch": 4.665335463258786, + "grad_norm": 0.09705425798892975, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5841 + }, + { + "epoch": 4.666134185303514, + "grad_norm": 0.09543570131063461, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5842 + }, + { + "epoch": 4.666932907348243, + "grad_norm": 0.08460460603237152, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5843 + }, + { + "epoch": 4.667731629392971, + "grad_norm": 0.12419378757476807, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5844 + }, + { + "epoch": 4.6685303514377, + "grad_norm": 0.09184019267559052, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5845 + }, + { + "epoch": 4.669329073482428, + "grad_norm": 0.09425100684165955, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5846 + }, + { + "epoch": 4.670127795527156, + "grad_norm": 0.19701971113681793, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5847 + }, + { + "epoch": 4.6709265175718855, + "grad_norm": 0.0648239254951477, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5848 + }, + { + "epoch": 4.671725239616613, + "grad_norm": 0.11558888107538223, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5849 + }, + { + "epoch": 4.672523961661342, + "grad_norm": 0.12397976219654083, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5850 + }, + { + "epoch": 4.67332268370607, + "grad_norm": 0.10640132427215576, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5851 + }, + { + "epoch": 4.674121405750799, + "grad_norm": 0.08930578827857971, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5852 + }, + { + "epoch": 4.674920127795527, + "grad_norm": 0.06212310120463371, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5853 + }, + { + "epoch": 4.675718849840256, + "grad_norm": 0.08568188548088074, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5854 + }, + { + "epoch": 4.676517571884984, + "grad_norm": 0.11431021988391876, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5855 + }, + { + "epoch": 4.677316293929713, + "grad_norm": 0.34381258487701416, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5856 + }, + { + "epoch": 4.678115015974441, + "grad_norm": 0.1996181309223175, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5857 + }, + { + "epoch": 4.678913738019169, + "grad_norm": 0.2900290787220001, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5858 + }, + { + "epoch": 4.6797124600638975, + "grad_norm": 0.35768410563468933, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5859 + }, + { + "epoch": 4.680511182108626, + "grad_norm": 0.1027536615729332, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5860 + }, + { + "epoch": 4.681309904153355, + "grad_norm": 0.6286419630050659, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5861 + }, + { + "epoch": 4.682108626198083, + "grad_norm": 0.5037242770195007, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5862 + }, + { + "epoch": 4.682907348242812, + "grad_norm": 0.34654417634010315, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5863 + }, + { + "epoch": 4.68370607028754, + "grad_norm": 0.18139366805553436, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5864 + }, + { + "epoch": 4.684504792332269, + "grad_norm": 0.2101605087518692, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5865 + }, + { + "epoch": 4.685303514376997, + "grad_norm": 0.0922360047698021, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5866 + }, + { + "epoch": 4.686102236421725, + "grad_norm": 0.23476624488830566, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5867 + }, + { + "epoch": 4.686900958466453, + "grad_norm": 0.1843792051076889, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5868 + }, + { + "epoch": 4.687699680511182, + "grad_norm": 0.09449298679828644, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5869 + }, + { + "epoch": 4.68849840255591, + "grad_norm": 0.13996686041355133, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5870 + }, + { + "epoch": 4.689297124600639, + "grad_norm": 2.113325357437134, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5871 + }, + { + "epoch": 4.6900958466453675, + "grad_norm": 0.35181209444999695, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 5872 + }, + { + "epoch": 4.690894568690096, + "grad_norm": 0.3530768156051636, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 5873 + }, + { + "epoch": 4.6916932907348246, + "grad_norm": 0.25919783115386963, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5874 + }, + { + "epoch": 4.692492012779553, + "grad_norm": 0.19770720601081848, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 5875 + }, + { + "epoch": 4.693290734824281, + "grad_norm": 0.32085585594177246, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5876 + }, + { + "epoch": 4.694089456869009, + "grad_norm": 0.14215363562107086, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5877 + }, + { + "epoch": 4.694888178913738, + "grad_norm": 0.24502497911453247, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5878 + }, + { + "epoch": 4.695686900958466, + "grad_norm": 0.15765784680843353, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5879 + }, + { + "epoch": 4.696485623003195, + "grad_norm": 0.13945002853870392, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5880 + }, + { + "epoch": 4.697284345047923, + "grad_norm": 0.16315795481204987, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5881 + }, + { + "epoch": 4.698083067092652, + "grad_norm": 0.0803297907114029, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5882 + }, + { + "epoch": 4.69888178913738, + "grad_norm": 0.09848042577505112, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5883 + }, + { + "epoch": 4.699680511182109, + "grad_norm": 0.22370465099811554, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5884 + }, + { + "epoch": 4.700479233226837, + "grad_norm": 0.09369395673274994, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5885 + }, + { + "epoch": 4.701277955271565, + "grad_norm": 0.42340102791786194, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5886 + }, + { + "epoch": 4.702076677316294, + "grad_norm": 0.08471440523862839, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5887 + }, + { + "epoch": 4.702875399361022, + "grad_norm": 0.11350758373737335, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5888 + }, + { + "epoch": 4.703674121405751, + "grad_norm": 0.16862216591835022, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5889 + }, + { + "epoch": 4.704472843450479, + "grad_norm": 0.17468953132629395, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5890 + }, + { + "epoch": 4.705271565495208, + "grad_norm": 0.09154370427131653, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5891 + }, + { + "epoch": 4.706070287539936, + "grad_norm": 0.08715084940195084, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5892 + }, + { + "epoch": 4.706869009584665, + "grad_norm": 0.06797291338443756, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5893 + }, + { + "epoch": 4.707667731629393, + "grad_norm": 0.17333610355854034, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5894 + }, + { + "epoch": 4.708466453674122, + "grad_norm": 0.17272767424583435, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5895 + }, + { + "epoch": 4.7092651757188495, + "grad_norm": 0.11773357540369034, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5896 + }, + { + "epoch": 4.710063897763578, + "grad_norm": 0.08420758694410324, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5897 + }, + { + "epoch": 4.710862619808307, + "grad_norm": 0.08672801405191422, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5898 + }, + { + "epoch": 4.711661341853035, + "grad_norm": 0.2356635183095932, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5899 + }, + { + "epoch": 4.712460063897764, + "grad_norm": 0.06091082841157913, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5900 + }, + { + "epoch": 4.713258785942492, + "grad_norm": 0.09156842529773712, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5901 + }, + { + "epoch": 4.714057507987221, + "grad_norm": 0.06548108160495758, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5902 + }, + { + "epoch": 4.714856230031949, + "grad_norm": 0.12813016772270203, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5903 + }, + { + "epoch": 4.715654952076678, + "grad_norm": 0.1518833339214325, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5904 + }, + { + "epoch": 4.716453674121405, + "grad_norm": 0.09331580996513367, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5905 + }, + { + "epoch": 4.717252396166134, + "grad_norm": 0.11989843845367432, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5906 + }, + { + "epoch": 4.718051118210862, + "grad_norm": 0.1277054399251938, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5907 + }, + { + "epoch": 4.718849840255591, + "grad_norm": 0.11199159920215607, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5908 + }, + { + "epoch": 4.7196485623003195, + "grad_norm": 0.09120891988277435, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5909 + }, + { + "epoch": 4.720447284345048, + "grad_norm": 0.11668230593204498, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5910 + }, + { + "epoch": 4.7212460063897765, + "grad_norm": 0.08594206720590591, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5911 + }, + { + "epoch": 4.722044728434505, + "grad_norm": 0.11563027650117874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5912 + }, + { + "epoch": 4.722843450479234, + "grad_norm": 0.15066663920879364, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5913 + }, + { + "epoch": 4.723642172523961, + "grad_norm": 0.08566875755786896, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5914 + }, + { + "epoch": 4.72444089456869, + "grad_norm": 0.060813747346401215, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5915 + }, + { + "epoch": 4.725239616613418, + "grad_norm": 0.07391642779111862, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5916 + }, + { + "epoch": 4.726038338658147, + "grad_norm": 0.04867766425013542, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5917 + }, + { + "epoch": 4.726837060702875, + "grad_norm": 0.09468305110931396, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5918 + }, + { + "epoch": 4.727635782747604, + "grad_norm": 0.07287945598363876, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5919 + }, + { + "epoch": 4.728434504792332, + "grad_norm": 0.08984806388616562, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5920 + }, + { + "epoch": 4.729233226837061, + "grad_norm": 0.1755092740058899, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5921 + }, + { + "epoch": 4.7300319488817895, + "grad_norm": 0.09656399488449097, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5922 + }, + { + "epoch": 4.730830670926517, + "grad_norm": 0.15759015083312988, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5923 + }, + { + "epoch": 4.731629392971246, + "grad_norm": 0.13238383829593658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5924 + }, + { + "epoch": 4.732428115015974, + "grad_norm": 0.05352601036429405, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5925 + }, + { + "epoch": 4.733226837060703, + "grad_norm": 0.06253937631845474, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5926 + }, + { + "epoch": 4.734025559105431, + "grad_norm": 0.057317376136779785, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5927 + }, + { + "epoch": 4.73482428115016, + "grad_norm": 0.12154382467269897, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5928 + }, + { + "epoch": 4.735623003194888, + "grad_norm": 0.0547759085893631, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5929 + }, + { + "epoch": 4.736421725239617, + "grad_norm": 0.07446085661649704, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5930 + }, + { + "epoch": 4.737220447284345, + "grad_norm": 0.09809007495641708, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5931 + }, + { + "epoch": 4.738019169329074, + "grad_norm": 0.12434732168912888, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5932 + }, + { + "epoch": 4.738817891373802, + "grad_norm": 0.12192053347826004, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5933 + }, + { + "epoch": 4.73961661341853, + "grad_norm": 0.08006733655929565, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5934 + }, + { + "epoch": 4.7404153354632586, + "grad_norm": 0.14677436649799347, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5935 + }, + { + "epoch": 4.741214057507987, + "grad_norm": 0.10133987665176392, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5936 + }, + { + "epoch": 4.742012779552716, + "grad_norm": 0.10331577062606812, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5937 + }, + { + "epoch": 4.742811501597444, + "grad_norm": 0.14596082270145416, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5938 + }, + { + "epoch": 4.743610223642173, + "grad_norm": 0.15139590203762054, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5939 + }, + { + "epoch": 4.744408945686901, + "grad_norm": 0.0935182124376297, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5940 + }, + { + "epoch": 4.74520766773163, + "grad_norm": 0.1002865880727768, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5941 + }, + { + "epoch": 4.746006389776358, + "grad_norm": 0.0968283861875534, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5942 + }, + { + "epoch": 4.746805111821086, + "grad_norm": 0.11680585891008377, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5943 + }, + { + "epoch": 4.747603833865814, + "grad_norm": 0.12163184583187103, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 5944 + }, + { + "epoch": 4.748402555910543, + "grad_norm": 0.07288502901792526, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5945 + }, + { + "epoch": 4.7492012779552715, + "grad_norm": 0.3335740566253662, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5946 + }, + { + "epoch": 4.75, + "grad_norm": 0.15408654510974884, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5947 + }, + { + "epoch": 4.7507987220447285, + "grad_norm": 0.09612353891134262, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5948 + }, + { + "epoch": 4.751597444089457, + "grad_norm": 0.10403789579868317, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5949 + }, + { + "epoch": 4.752396166134186, + "grad_norm": 0.13026492297649384, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5950 + }, + { + "epoch": 4.753194888178914, + "grad_norm": 0.061955004930496216, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5951 + }, + { + "epoch": 4.753993610223642, + "grad_norm": 0.08264514058828354, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5952 + }, + { + "epoch": 4.75479233226837, + "grad_norm": 0.1132993996143341, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5953 + }, + { + "epoch": 4.755591054313099, + "grad_norm": 0.09022228419780731, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5954 + }, + { + "epoch": 4.756389776357827, + "grad_norm": 0.13192631304264069, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5955 + }, + { + "epoch": 4.757188498402556, + "grad_norm": 0.08400337398052216, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5956 + }, + { + "epoch": 4.757987220447284, + "grad_norm": 0.05070018023252487, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5957 + }, + { + "epoch": 4.758785942492013, + "grad_norm": 0.09561482816934586, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5958 + }, + { + "epoch": 4.7595846645367414, + "grad_norm": 0.07369764894247055, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5959 + }, + { + "epoch": 4.76038338658147, + "grad_norm": 0.07777421176433563, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5960 + }, + { + "epoch": 4.761182108626198, + "grad_norm": 0.11525892466306686, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5961 + }, + { + "epoch": 4.761980830670926, + "grad_norm": 0.1788506656885147, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5962 + }, + { + "epoch": 4.762779552715655, + "grad_norm": 0.10067635029554367, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5963 + }, + { + "epoch": 4.763578274760383, + "grad_norm": 0.08447863161563873, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5964 + }, + { + "epoch": 4.764376996805112, + "grad_norm": 0.06801758706569672, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5965 + }, + { + "epoch": 4.76517571884984, + "grad_norm": 0.07363327592611313, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5966 + }, + { + "epoch": 4.765974440894569, + "grad_norm": 0.05584784597158432, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5967 + }, + { + "epoch": 4.766773162939297, + "grad_norm": 0.10064459592103958, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5968 + }, + { + "epoch": 4.767571884984026, + "grad_norm": 0.1176871508359909, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5969 + }, + { + "epoch": 4.768370607028754, + "grad_norm": 0.17485690116882324, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5970 + }, + { + "epoch": 4.769169329073483, + "grad_norm": 0.15753531455993652, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5971 + }, + { + "epoch": 4.7699680511182105, + "grad_norm": 0.1669864058494568, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5972 + }, + { + "epoch": 4.770766773162939, + "grad_norm": 0.07706131786108017, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5973 + }, + { + "epoch": 4.771565495207668, + "grad_norm": 0.3537883460521698, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5974 + }, + { + "epoch": 4.772364217252396, + "grad_norm": 0.20092372596263885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5975 + }, + { + "epoch": 4.773162939297125, + "grad_norm": 0.06521142274141312, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5976 + }, + { + "epoch": 4.773961661341853, + "grad_norm": 0.1203140988945961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5977 + }, + { + "epoch": 4.774760383386582, + "grad_norm": 0.09655500948429108, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5978 + }, + { + "epoch": 4.77555910543131, + "grad_norm": 0.09220302104949951, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5979 + }, + { + "epoch": 4.776357827476039, + "grad_norm": 0.7336251735687256, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5980 + }, + { + "epoch": 4.777156549520766, + "grad_norm": 0.21415477991104126, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5981 + }, + { + "epoch": 4.777955271565495, + "grad_norm": 0.14869220554828644, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5982 + }, + { + "epoch": 4.7787539936102235, + "grad_norm": 0.0779772400856018, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5983 + }, + { + "epoch": 4.779552715654952, + "grad_norm": 0.14274317026138306, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5984 + }, + { + "epoch": 4.7803514376996805, + "grad_norm": 0.11580413579940796, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5985 + }, + { + "epoch": 4.781150159744409, + "grad_norm": 0.055023401975631714, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5986 + }, + { + "epoch": 4.781948881789138, + "grad_norm": 0.11657343804836273, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5987 + }, + { + "epoch": 4.782747603833866, + "grad_norm": 0.07336080819368362, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5988 + }, + { + "epoch": 4.783546325878595, + "grad_norm": 0.06066504120826721, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5989 + }, + { + "epoch": 4.784345047923322, + "grad_norm": 0.05784285068511963, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5990 + }, + { + "epoch": 4.785143769968051, + "grad_norm": 0.06317969411611557, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5991 + }, + { + "epoch": 4.785942492012779, + "grad_norm": 0.1001245379447937, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5992 + }, + { + "epoch": 4.786741214057508, + "grad_norm": 0.0743420347571373, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5993 + }, + { + "epoch": 4.787539936102236, + "grad_norm": 0.07082799077033997, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5994 + }, + { + "epoch": 4.788338658146965, + "grad_norm": 0.11087984591722488, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5995 + }, + { + "epoch": 4.789137380191693, + "grad_norm": 0.05923386290669441, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5996 + }, + { + "epoch": 4.789936102236422, + "grad_norm": 0.1020246297121048, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5997 + }, + { + "epoch": 4.7907348242811505, + "grad_norm": 0.11524185538291931, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5998 + }, + { + "epoch": 4.791533546325878, + "grad_norm": 0.06959006190299988, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5999 + }, + { + "epoch": 4.792332268370607, + "grad_norm": 0.19179846346378326, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6000 + }, + { + "epoch": 4.793130990415335, + "grad_norm": 0.17232562601566315, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6001 + }, + { + "epoch": 4.793929712460064, + "grad_norm": 0.7047739028930664, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6002 + }, + { + "epoch": 4.794728434504792, + "grad_norm": 0.09086379408836365, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6003 + }, + { + "epoch": 4.795527156549521, + "grad_norm": 0.17785955965518951, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6004 + }, + { + "epoch": 4.796325878594249, + "grad_norm": 0.09529274702072144, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6005 + }, + { + "epoch": 4.797124600638978, + "grad_norm": 0.08041567355394363, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6006 + }, + { + "epoch": 4.797923322683706, + "grad_norm": 0.13888375461101532, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6007 + }, + { + "epoch": 4.798722044728435, + "grad_norm": 0.08110564947128296, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6008 + }, + { + "epoch": 4.799520766773163, + "grad_norm": 0.07443006336688995, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6009 + }, + { + "epoch": 4.800319488817891, + "grad_norm": 0.08499104529619217, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6010 + }, + { + "epoch": 4.80111821086262, + "grad_norm": 0.0616084523499012, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6011 + }, + { + "epoch": 4.801916932907348, + "grad_norm": 0.10845918208360672, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6012 + }, + { + "epoch": 4.802715654952077, + "grad_norm": 0.057658810168504715, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6013 + }, + { + "epoch": 4.803514376996805, + "grad_norm": 0.07163018733263016, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6014 + }, + { + "epoch": 4.804313099041534, + "grad_norm": 0.07016896456480026, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6015 + }, + { + "epoch": 4.805111821086262, + "grad_norm": 0.08233597129583359, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6016 + }, + { + "epoch": 4.805910543130991, + "grad_norm": 0.05408332124352455, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6017 + }, + { + "epoch": 4.806709265175719, + "grad_norm": 0.0886560007929802, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6018 + }, + { + "epoch": 4.807507987220447, + "grad_norm": 0.17860093712806702, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6019 + }, + { + "epoch": 4.8083067092651754, + "grad_norm": 0.26264694333076477, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6020 + }, + { + "epoch": 4.809105431309904, + "grad_norm": 0.08523311465978622, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6021 + }, + { + "epoch": 4.8099041533546325, + "grad_norm": 0.09873831272125244, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6022 + }, + { + "epoch": 4.810702875399361, + "grad_norm": 0.16135412454605103, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6023 + }, + { + "epoch": 4.81150159744409, + "grad_norm": 0.08003875613212585, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6024 + }, + { + "epoch": 4.812300319488818, + "grad_norm": 0.09117014706134796, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6025 + }, + { + "epoch": 4.813099041533547, + "grad_norm": 0.2316243052482605, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6026 + }, + { + "epoch": 4.813897763578275, + "grad_norm": 0.16050362586975098, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6027 + }, + { + "epoch": 4.814696485623003, + "grad_norm": 0.13559919595718384, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6028 + }, + { + "epoch": 4.815495207667731, + "grad_norm": 0.08917123824357986, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6029 + }, + { + "epoch": 4.81629392971246, + "grad_norm": 0.11498702317476273, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6030 + }, + { + "epoch": 4.817092651757188, + "grad_norm": 0.14677700400352478, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6031 + }, + { + "epoch": 4.817891373801917, + "grad_norm": 0.08849102258682251, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6032 + }, + { + "epoch": 4.818690095846645, + "grad_norm": 4.0974507331848145, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6033 + }, + { + "epoch": 4.819488817891374, + "grad_norm": 0.24215161800384521, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6034 + }, + { + "epoch": 4.8202875399361025, + "grad_norm": 0.2679882049560547, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6035 + }, + { + "epoch": 4.821086261980831, + "grad_norm": 0.11113203316926956, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6036 + }, + { + "epoch": 4.821884984025559, + "grad_norm": 0.17725592851638794, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6037 + }, + { + "epoch": 4.822683706070287, + "grad_norm": 0.08446694165468216, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6038 + }, + { + "epoch": 4.823482428115016, + "grad_norm": 0.26757946610450745, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6039 + }, + { + "epoch": 4.824281150159744, + "grad_norm": 0.1900561898946762, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6040 + }, + { + "epoch": 4.825079872204473, + "grad_norm": 0.21993426978588104, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6041 + }, + { + "epoch": 4.825878594249201, + "grad_norm": 15.862943649291992, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6042 + }, + { + "epoch": 4.82667731629393, + "grad_norm": 0.793515145778656, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6043 + }, + { + "epoch": 4.827476038338658, + "grad_norm": 0.5607691407203674, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6044 + }, + { + "epoch": 4.828274760383387, + "grad_norm": 0.2853091359138489, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6045 + }, + { + "epoch": 4.8290734824281145, + "grad_norm": 0.3579944670200348, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6046 + }, + { + "epoch": 4.829872204472844, + "grad_norm": 0.26784929633140564, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6047 + }, + { + "epoch": 4.830670926517572, + "grad_norm": 0.2363428920507431, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6048 + }, + { + "epoch": 4.8314696485623, + "grad_norm": 0.2922425866127014, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6049 + }, + { + "epoch": 4.832268370607029, + "grad_norm": 0.2173125147819519, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6050 + }, + { + "epoch": 4.833067092651757, + "grad_norm": 0.23552696406841278, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6051 + }, + { + "epoch": 4.833865814696486, + "grad_norm": 1.2383053302764893, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6052 + }, + { + "epoch": 4.834664536741214, + "grad_norm": 0.3284873366355896, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6053 + }, + { + "epoch": 4.835463258785943, + "grad_norm": 0.15584628283977509, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6054 + }, + { + "epoch": 4.836261980830671, + "grad_norm": 0.3136327862739563, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6055 + }, + { + "epoch": 4.8370607028754, + "grad_norm": 0.19863441586494446, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6056 + }, + { + "epoch": 4.837859424920127, + "grad_norm": 0.273644357919693, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6057 + }, + { + "epoch": 4.838658146964856, + "grad_norm": 0.2560950517654419, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6058 + }, + { + "epoch": 4.8394568690095845, + "grad_norm": 0.2243220955133438, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6059 + }, + { + "epoch": 4.840255591054313, + "grad_norm": 0.16328522562980652, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6060 + }, + { + "epoch": 4.8410543130990416, + "grad_norm": 0.42267754673957825, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6061 + }, + { + "epoch": 4.84185303514377, + "grad_norm": 0.21733495593070984, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6062 + }, + { + "epoch": 4.842651757188499, + "grad_norm": 0.12917862832546234, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6063 + }, + { + "epoch": 4.843450479233227, + "grad_norm": 0.1829921007156372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6064 + }, + { + "epoch": 4.844249201277956, + "grad_norm": 0.08751819282770157, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6065 + }, + { + "epoch": 4.845047923322683, + "grad_norm": 0.16521455347537994, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6066 + }, + { + "epoch": 4.845846645367412, + "grad_norm": 0.4328543543815613, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6067 + }, + { + "epoch": 4.84664536741214, + "grad_norm": 0.2682073712348938, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6068 + }, + { + "epoch": 4.847444089456869, + "grad_norm": 0.15217293798923492, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6069 + }, + { + "epoch": 4.848242811501597, + "grad_norm": 0.12807190418243408, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 6070 + }, + { + "epoch": 4.849041533546326, + "grad_norm": 1.4503207206726074, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6071 + }, + { + "epoch": 4.8498402555910545, + "grad_norm": 0.5045278668403625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6072 + }, + { + "epoch": 4.850638977635783, + "grad_norm": 0.1992882788181305, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6073 + }, + { + "epoch": 4.8514376996805115, + "grad_norm": 0.3178166151046753, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 6074 + }, + { + "epoch": 4.852236421725239, + "grad_norm": 0.1244354322552681, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6075 + }, + { + "epoch": 4.853035143769968, + "grad_norm": 0.2837885320186615, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 6076 + }, + { + "epoch": 4.853833865814696, + "grad_norm": 0.11910229921340942, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6077 + }, + { + "epoch": 4.854632587859425, + "grad_norm": 0.5774815678596497, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6078 + }, + { + "epoch": 4.855431309904153, + "grad_norm": 0.13028140366077423, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6079 + }, + { + "epoch": 4.856230031948882, + "grad_norm": 0.21022816002368927, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6080 + }, + { + "epoch": 4.85702875399361, + "grad_norm": 0.11758062243461609, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6081 + }, + { + "epoch": 4.857827476038339, + "grad_norm": 0.1321621984243393, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6082 + }, + { + "epoch": 4.858626198083067, + "grad_norm": 0.11481605470180511, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6083 + }, + { + "epoch": 4.859424920127795, + "grad_norm": 0.0976998507976532, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6084 + }, + { + "epoch": 4.8602236421725244, + "grad_norm": 0.7211679220199585, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6085 + }, + { + "epoch": 4.861022364217252, + "grad_norm": 0.1417546272277832, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6086 + }, + { + "epoch": 4.861821086261981, + "grad_norm": 0.13830699026584625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6087 + }, + { + "epoch": 4.862619808306709, + "grad_norm": 0.24840030074119568, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6088 + }, + { + "epoch": 4.863418530351438, + "grad_norm": 3.442054033279419, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6089 + }, + { + "epoch": 4.864217252396166, + "grad_norm": 0.21404840052127838, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 6090 + }, + { + "epoch": 4.865015974440895, + "grad_norm": 0.3657711148262024, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6091 + }, + { + "epoch": 4.865814696485623, + "grad_norm": 0.2189537137746811, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6092 + }, + { + "epoch": 4.866613418530352, + "grad_norm": 0.17866109311580658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6093 + }, + { + "epoch": 4.86741214057508, + "grad_norm": 0.19208978116512299, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6094 + }, + { + "epoch": 4.868210862619808, + "grad_norm": 0.08330709487199783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6095 + }, + { + "epoch": 4.8690095846645365, + "grad_norm": 0.1194678544998169, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6096 + }, + { + "epoch": 4.869808306709265, + "grad_norm": 0.07852908223867416, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6097 + }, + { + "epoch": 4.8706070287539935, + "grad_norm": 0.09230814129114151, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 6098 + }, + { + "epoch": 4.871405750798722, + "grad_norm": 0.06775277107954025, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6099 + }, + { + "epoch": 4.872204472843451, + "grad_norm": 0.28747716546058655, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6100 + }, + { + "epoch": 4.873003194888179, + "grad_norm": 0.11956486105918884, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6101 + }, + { + "epoch": 4.873801916932908, + "grad_norm": 0.09843557327985764, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6102 + }, + { + "epoch": 4.874600638977636, + "grad_norm": 0.08408313244581223, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6103 + }, + { + "epoch": 4.875399361022364, + "grad_norm": 0.08230917155742645, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6104 + }, + { + "epoch": 4.876198083067092, + "grad_norm": 0.08927451819181442, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6105 + }, + { + "epoch": 4.876996805111821, + "grad_norm": 0.5961875319480896, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6106 + }, + { + "epoch": 4.877795527156549, + "grad_norm": 0.5851842164993286, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 6107 + }, + { + "epoch": 4.878594249201278, + "grad_norm": 0.4428717792034149, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6108 + }, + { + "epoch": 4.8793929712460065, + "grad_norm": 3.760467052459717, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 6109 + }, + { + "epoch": 4.880191693290735, + "grad_norm": 84.49950408935547, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 6110 + }, + { + "epoch": 4.8809904153354635, + "grad_norm": 66320516.0, + "learning_rate": 0.0005, + "loss": 1.1423, + "step": 6111 + }, + { + "epoch": 4.881789137380192, + "grad_norm": 676613568.0, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 6112 + }, + { + "epoch": 4.88258785942492, + "grad_norm": 2556641280.0, + "learning_rate": 0.0005, + "loss": 1.2458, + "step": 6113 + }, + { + "epoch": 4.883386581469648, + "grad_norm": 21960.341796875, + "learning_rate": 0.0005, + "loss": 1.3163, + "step": 6114 + }, + { + "epoch": 4.884185303514377, + "grad_norm": 3668.3603515625, + "learning_rate": 0.0005, + "loss": 1.4954, + "step": 6115 + }, + { + "epoch": 4.884984025559105, + "grad_norm": 9.501830101013184, + "learning_rate": 0.0005, + "loss": 2.0388, + "step": 6116 + }, + { + "epoch": 4.885782747603834, + "grad_norm": 1.9570647478103638, + "learning_rate": 0.0005, + "loss": 1.3693, + "step": 6117 + }, + { + "epoch": 4.886581469648562, + "grad_norm": 0.9678036570549011, + "learning_rate": 0.0005, + "loss": 1.2694, + "step": 6118 + }, + { + "epoch": 4.887380191693291, + "grad_norm": 0.7094120383262634, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 6119 + }, + { + "epoch": 4.888178913738019, + "grad_norm": 0.4029041826725006, + "learning_rate": 0.0005, + "loss": 1.1809, + "step": 6120 + }, + { + "epoch": 4.888977635782748, + "grad_norm": 0.8682520389556885, + "learning_rate": 0.0005, + "loss": 1.1689, + "step": 6121 + }, + { + "epoch": 4.8897763578274756, + "grad_norm": 0.5829207301139832, + "learning_rate": 0.0005, + "loss": 1.1921, + "step": 6122 + }, + { + "epoch": 4.890575079872205, + "grad_norm": 0.5038579702377319, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6123 + }, + { + "epoch": 4.891373801916933, + "grad_norm": 0.532597005367279, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6124 + }, + { + "epoch": 4.892172523961661, + "grad_norm": 0.20122192800045013, + "learning_rate": 0.0005, + "loss": 1.1399, + "step": 6125 + }, + { + "epoch": 4.89297124600639, + "grad_norm": 0.22419369220733643, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 6126 + }, + { + "epoch": 4.893769968051118, + "grad_norm": 0.2319759726524353, + "learning_rate": 0.0005, + "loss": 1.13, + "step": 6127 + }, + { + "epoch": 4.894568690095847, + "grad_norm": 0.18733178079128265, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 6128 + }, + { + "epoch": 4.895367412140575, + "grad_norm": 0.35497167706489563, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 6129 + }, + { + "epoch": 4.896166134185304, + "grad_norm": 0.2551584243774414, + "learning_rate": 0.0005, + "loss": 1.1236, + "step": 6130 + }, + { + "epoch": 4.896964856230032, + "grad_norm": 0.337982714176178, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 6131 + }, + { + "epoch": 4.897763578274761, + "grad_norm": 0.2945634722709656, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 6132 + }, + { + "epoch": 4.8985623003194885, + "grad_norm": 0.2571047842502594, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 6133 + }, + { + "epoch": 4.899361022364217, + "grad_norm": 0.23297041654586792, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 6134 + }, + { + "epoch": 4.9001597444089455, + "grad_norm": 0.24131764471530914, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 6135 + }, + { + "epoch": 4.900958466453674, + "grad_norm": 0.22283275425434113, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 6136 + }, + { + "epoch": 4.901757188498403, + "grad_norm": 0.1691826730966568, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 6137 + }, + { + "epoch": 4.902555910543131, + "grad_norm": 0.1532466858625412, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 6138 + }, + { + "epoch": 4.90335463258786, + "grad_norm": 0.14135177433490753, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 6139 + }, + { + "epoch": 4.904153354632588, + "grad_norm": 0.14410537481307983, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 6140 + }, + { + "epoch": 4.904952076677317, + "grad_norm": 0.1097448468208313, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 6141 + }, + { + "epoch": 4.905750798722044, + "grad_norm": 0.0851673111319542, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 6142 + }, + { + "epoch": 4.906549520766773, + "grad_norm": 0.13842107355594635, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 6143 + }, + { + "epoch": 4.907348242811501, + "grad_norm": 0.15126317739486694, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 6144 + }, + { + "epoch": 4.90814696485623, + "grad_norm": 0.13176177442073822, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6145 + }, + { + "epoch": 4.9089456869009584, + "grad_norm": 0.164788156747818, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 6146 + }, + { + "epoch": 4.909744408945687, + "grad_norm": 0.24943718314170837, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6147 + }, + { + "epoch": 4.9105431309904155, + "grad_norm": 0.4325760304927826, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 6148 + }, + { + "epoch": 4.911341853035144, + "grad_norm": 0.5711309313774109, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 6149 + }, + { + "epoch": 4.912140575079873, + "grad_norm": 0.37636998295783997, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 6150 + }, + { + "epoch": 4.9129392971246, + "grad_norm": 0.2788292169570923, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 6151 + }, + { + "epoch": 4.913738019169329, + "grad_norm": 0.31709909439086914, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 6152 + }, + { + "epoch": 4.914536741214057, + "grad_norm": 0.14585916697978973, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6153 + }, + { + "epoch": 4.915335463258786, + "grad_norm": 0.1302923858165741, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 6154 + }, + { + "epoch": 4.916134185303514, + "grad_norm": 0.16156400740146637, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6155 + }, + { + "epoch": 4.916932907348243, + "grad_norm": 0.2323192059993744, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6156 + }, + { + "epoch": 4.917731629392971, + "grad_norm": 0.17504405975341797, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 6157 + }, + { + "epoch": 4.9185303514377, + "grad_norm": 0.07211807370185852, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6158 + }, + { + "epoch": 4.919329073482428, + "grad_norm": 0.26426371932029724, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6159 + }, + { + "epoch": 4.920127795527156, + "grad_norm": 0.237858384847641, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 6160 + }, + { + "epoch": 4.9209265175718855, + "grad_norm": 0.23863473534584045, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6161 + }, + { + "epoch": 4.921725239616613, + "grad_norm": 0.3053814768791199, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6162 + }, + { + "epoch": 4.922523961661342, + "grad_norm": 0.2143447995185852, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6163 + }, + { + "epoch": 4.92332268370607, + "grad_norm": 0.12295633554458618, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 6164 + }, + { + "epoch": 4.924121405750799, + "grad_norm": 0.11128787696361542, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6165 + }, + { + "epoch": 4.924920127795527, + "grad_norm": 0.158652663230896, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 6166 + }, + { + "epoch": 4.925718849840256, + "grad_norm": 0.17612649500370026, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6167 + }, + { + "epoch": 4.926517571884984, + "grad_norm": 0.12243206799030304, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6168 + }, + { + "epoch": 4.927316293929713, + "grad_norm": 0.12234453856945038, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6169 + }, + { + "epoch": 4.928115015974441, + "grad_norm": 0.1968356966972351, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6170 + }, + { + "epoch": 4.928913738019169, + "grad_norm": 0.17286576330661774, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6171 + }, + { + "epoch": 4.9297124600638975, + "grad_norm": 0.0847749337553978, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6172 + }, + { + "epoch": 4.930511182108626, + "grad_norm": 0.0704331174492836, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6173 + }, + { + "epoch": 4.931309904153355, + "grad_norm": 0.12671123445034027, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6174 + }, + { + "epoch": 4.932108626198083, + "grad_norm": 0.10653524100780487, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6175 + }, + { + "epoch": 4.932907348242812, + "grad_norm": 0.0606958381831646, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6176 + }, + { + "epoch": 4.93370607028754, + "grad_norm": 0.12248247116804123, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6177 + }, + { + "epoch": 4.934504792332269, + "grad_norm": 0.1370074301958084, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6178 + }, + { + "epoch": 4.935303514376997, + "grad_norm": 0.05940835922956467, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6179 + }, + { + "epoch": 4.936102236421725, + "grad_norm": 0.1440308690071106, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6180 + }, + { + "epoch": 4.936900958466453, + "grad_norm": 0.1972372829914093, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6181 + }, + { + "epoch": 4.937699680511182, + "grad_norm": 0.10575850307941437, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6182 + }, + { + "epoch": 4.93849840255591, + "grad_norm": 0.11902400851249695, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6183 + }, + { + "epoch": 4.939297124600639, + "grad_norm": 0.15276090800762177, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6184 + }, + { + "epoch": 4.9400958466453675, + "grad_norm": 0.07495112717151642, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6185 + }, + { + "epoch": 4.940894568690096, + "grad_norm": 0.10652542859315872, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6186 + }, + { + "epoch": 4.9416932907348246, + "grad_norm": 0.11347164958715439, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6187 + }, + { + "epoch": 4.942492012779553, + "grad_norm": 0.19946135580539703, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6188 + }, + { + "epoch": 4.943290734824281, + "grad_norm": 0.0771450325846672, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6189 + }, + { + "epoch": 4.944089456869009, + "grad_norm": 0.1086430475115776, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6190 + }, + { + "epoch": 4.944888178913738, + "grad_norm": 0.08790839463472366, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6191 + }, + { + "epoch": 4.945686900958466, + "grad_norm": 0.22063800692558289, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6192 + }, + { + "epoch": 4.946485623003195, + "grad_norm": 0.22287815809249878, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6193 + }, + { + "epoch": 4.947284345047923, + "grad_norm": 1.695265769958496, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6194 + }, + { + "epoch": 4.948083067092652, + "grad_norm": 0.6316840052604675, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 6195 + }, + { + "epoch": 4.94888178913738, + "grad_norm": 0.35637202858924866, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 6196 + }, + { + "epoch": 4.949680511182109, + "grad_norm": 0.2844616174697876, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 6197 + }, + { + "epoch": 4.950479233226837, + "grad_norm": 0.19614022970199585, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 6198 + }, + { + "epoch": 4.951277955271565, + "grad_norm": 0.3665562868118286, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 6199 + }, + { + "epoch": 4.952076677316294, + "grad_norm": 0.1485169231891632, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 6200 + }, + { + "epoch": 4.952875399361022, + "grad_norm": 0.19647273421287537, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6201 + }, + { + "epoch": 4.953674121405751, + "grad_norm": 0.19809085130691528, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6202 + }, + { + "epoch": 4.954472843450479, + "grad_norm": 0.1129874736070633, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6203 + }, + { + "epoch": 4.955271565495208, + "grad_norm": 0.2082832157611847, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6204 + }, + { + "epoch": 4.956070287539936, + "grad_norm": 0.20414425432682037, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6205 + }, + { + "epoch": 4.956869009584665, + "grad_norm": 0.16667422652244568, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6206 + }, + { + "epoch": 4.957667731629393, + "grad_norm": 0.25111839175224304, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 6207 + }, + { + "epoch": 4.958466453674122, + "grad_norm": 0.16995272040367126, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 6208 + }, + { + "epoch": 4.9592651757188495, + "grad_norm": 0.10725044459104538, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6209 + }, + { + "epoch": 4.960063897763578, + "grad_norm": 0.17728300392627716, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6210 + }, + { + "epoch": 4.960862619808307, + "grad_norm": 0.1334110051393509, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6211 + }, + { + "epoch": 4.961661341853035, + "grad_norm": 0.14835794270038605, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6212 + }, + { + "epoch": 4.962460063897764, + "grad_norm": 0.14602027833461761, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6213 + }, + { + "epoch": 4.963258785942492, + "grad_norm": 0.162953719496727, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6214 + }, + { + "epoch": 4.964057507987221, + "grad_norm": 0.7214393615722656, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6215 + }, + { + "epoch": 4.964856230031949, + "grad_norm": 0.27030259370803833, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6216 + }, + { + "epoch": 4.965654952076678, + "grad_norm": 0.18558967113494873, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 6217 + }, + { + "epoch": 4.966453674121405, + "grad_norm": 0.09276804327964783, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6218 + }, + { + "epoch": 4.967252396166134, + "grad_norm": 0.11957832425832748, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6219 + }, + { + "epoch": 4.968051118210862, + "grad_norm": 0.8338447213172913, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 6220 + }, + { + "epoch": 4.968849840255591, + "grad_norm": 0.7283904552459717, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 6221 + }, + { + "epoch": 4.9696485623003195, + "grad_norm": 0.07938430458307266, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6222 + }, + { + "epoch": 4.970447284345048, + "grad_norm": 0.15368770062923431, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6223 + }, + { + "epoch": 4.9712460063897765, + "grad_norm": 0.08823438733816147, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6224 + }, + { + "epoch": 4.972044728434505, + "grad_norm": 0.07656054943799973, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6225 + }, + { + "epoch": 4.972843450479234, + "grad_norm": 0.08777901530265808, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6226 + }, + { + "epoch": 4.973642172523961, + "grad_norm": 0.09863653033971786, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6227 + }, + { + "epoch": 4.97444089456869, + "grad_norm": 0.13259904086589813, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6228 + }, + { + "epoch": 4.975239616613418, + "grad_norm": 0.08148759603500366, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6229 + }, + { + "epoch": 4.976038338658147, + "grad_norm": 0.06982999294996262, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6230 + }, + { + "epoch": 4.976837060702875, + "grad_norm": 0.09279565513134003, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6231 + }, + { + "epoch": 4.977635782747604, + "grad_norm": 0.05821947008371353, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6232 + }, + { + "epoch": 4.978434504792332, + "grad_norm": 0.07475738972425461, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6233 + }, + { + "epoch": 4.979233226837061, + "grad_norm": 0.10464147478342056, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6234 + }, + { + "epoch": 4.9800319488817895, + "grad_norm": 0.08045687526464462, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6235 + }, + { + "epoch": 4.980830670926517, + "grad_norm": 0.08045300841331482, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6236 + }, + { + "epoch": 4.981629392971246, + "grad_norm": 0.10313838720321655, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6237 + }, + { + "epoch": 4.982428115015974, + "grad_norm": 0.08065208047628403, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6238 + }, + { + "epoch": 4.983226837060703, + "grad_norm": 0.0807032585144043, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6239 + }, + { + "epoch": 4.984025559105431, + "grad_norm": 0.06274307519197464, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6240 + }, + { + "epoch": 4.98482428115016, + "grad_norm": 0.07299554347991943, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6241 + }, + { + "epoch": 4.985623003194888, + "grad_norm": 0.0592481754720211, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6242 + }, + { + "epoch": 4.986421725239617, + "grad_norm": 0.0766056478023529, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6243 + }, + { + "epoch": 4.987220447284345, + "grad_norm": 0.07707066088914871, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6244 + }, + { + "epoch": 4.988019169329074, + "grad_norm": 0.7231665849685669, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6245 + }, + { + "epoch": 4.988817891373802, + "grad_norm": 0.0678652748465538, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6246 + }, + { + "epoch": 4.98961661341853, + "grad_norm": 3.667872905731201, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6247 + }, + { + "epoch": 4.9904153354632586, + "grad_norm": 0.2416938990354538, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6248 + }, + { + "epoch": 4.991214057507987, + "grad_norm": 0.27054834365844727, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6249 + }, + { + "epoch": 4.992012779552716, + "grad_norm": 0.1435888707637787, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6250 + }, + { + "epoch": 4.992811501597444, + "grad_norm": 0.1542683094739914, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6251 + }, + { + "epoch": 4.993610223642173, + "grad_norm": 0.1867702603340149, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6252 + }, + { + "epoch": 4.994408945686901, + "grad_norm": 0.09558507800102234, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6253 + }, + { + "epoch": 4.99520766773163, + "grad_norm": 0.3019699156284332, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6254 + }, + { + "epoch": 4.996006389776358, + "grad_norm": 0.11987117677927017, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6255 + }, + { + "epoch": 4.996805111821086, + "grad_norm": 0.11792664974927902, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6256 + }, + { + "epoch": 4.997603833865814, + "grad_norm": 0.15580247342586517, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6257 + }, + { + "epoch": 4.998402555910543, + "grad_norm": 0.20167642831802368, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6258 + }, + { + "epoch": 4.9992012779552715, + "grad_norm": 0.11203871667385101, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6259 + }, + { + "epoch": 5.0, + "grad_norm": 0.11081275343894958, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6260 + }, + { + "epoch": 5.0007987220447285, + "grad_norm": 0.11213719099760056, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6261 + }, + { + "epoch": 5.001597444089457, + "grad_norm": 0.11074960231781006, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6262 + }, + { + "epoch": 5.002396166134186, + "grad_norm": 0.07538039237260818, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6263 + }, + { + "epoch": 5.003194888178914, + "grad_norm": 0.0824185386300087, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6264 + }, + { + "epoch": 5.003993610223642, + "grad_norm": 0.08940225094556808, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6265 + }, + { + "epoch": 5.00479233226837, + "grad_norm": 0.07072590291500092, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6266 + }, + { + "epoch": 5.005591054313099, + "grad_norm": 0.13027220964431763, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6267 + }, + { + "epoch": 5.006389776357827, + "grad_norm": 0.09226793050765991, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6268 + }, + { + "epoch": 5.007188498402556, + "grad_norm": 0.1879013329744339, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6269 + }, + { + "epoch": 5.007987220447284, + "grad_norm": 0.09063144028186798, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6270 + }, + { + "epoch": 5.008785942492013, + "grad_norm": 0.09013621509075165, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6271 + }, + { + "epoch": 5.0095846645367414, + "grad_norm": 0.2404542863368988, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6272 + }, + { + "epoch": 5.01038338658147, + "grad_norm": 0.11968059092760086, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6273 + }, + { + "epoch": 5.0111821086261985, + "grad_norm": 0.16429072618484497, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6274 + }, + { + "epoch": 5.011980830670926, + "grad_norm": 0.08745420724153519, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6275 + }, + { + "epoch": 5.012779552715655, + "grad_norm": 0.09130390733480453, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6276 + }, + { + "epoch": 5.013578274760383, + "grad_norm": 0.06996344774961472, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6277 + }, + { + "epoch": 5.014376996805112, + "grad_norm": 0.06063826382160187, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6278 + }, + { + "epoch": 5.01517571884984, + "grad_norm": 0.14752542972564697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6279 + }, + { + "epoch": 5.015974440894569, + "grad_norm": 0.05987429618835449, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6280 + }, + { + "epoch": 5.016773162939297, + "grad_norm": 0.1716211587190628, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6281 + }, + { + "epoch": 5.017571884984026, + "grad_norm": 0.13823190331459045, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6282 + }, + { + "epoch": 5.018370607028754, + "grad_norm": 0.09764201194047928, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6283 + }, + { + "epoch": 5.019169329073482, + "grad_norm": 0.07897874712944031, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6284 + }, + { + "epoch": 5.0199680511182105, + "grad_norm": 0.07823392748832703, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6285 + }, + { + "epoch": 5.020766773162939, + "grad_norm": 0.1033136323094368, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6286 + }, + { + "epoch": 5.021565495207668, + "grad_norm": 0.07100827991962433, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6287 + }, + { + "epoch": 5.022364217252396, + "grad_norm": 0.40211987495422363, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 6288 + }, + { + "epoch": 5.023162939297125, + "grad_norm": 0.15459896624088287, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6289 + }, + { + "epoch": 5.023961661341853, + "grad_norm": 0.07789050787687302, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6290 + }, + { + "epoch": 5.024760383386582, + "grad_norm": 0.2116134762763977, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6291 + }, + { + "epoch": 5.02555910543131, + "grad_norm": 0.1842123568058014, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6292 + }, + { + "epoch": 5.026357827476039, + "grad_norm": 0.2037680447101593, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6293 + }, + { + "epoch": 5.027156549520766, + "grad_norm": 0.10851238667964935, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6294 + }, + { + "epoch": 5.027955271565495, + "grad_norm": 0.14465196430683136, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6295 + }, + { + "epoch": 5.0287539936102235, + "grad_norm": 0.11993128806352615, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6296 + }, + { + "epoch": 5.029552715654952, + "grad_norm": 0.13647349178791046, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6297 + }, + { + "epoch": 5.0303514376996805, + "grad_norm": 0.11265698075294495, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 6298 + }, + { + "epoch": 5.031150159744409, + "grad_norm": 18.601808547973633, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6299 + }, + { + "epoch": 5.031948881789138, + "grad_norm": 0.40079689025878906, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6300 + }, + { + "epoch": 5.032747603833866, + "grad_norm": 3.513967752456665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6301 + }, + { + "epoch": 5.033546325878595, + "grad_norm": 24.040191650390625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6302 + }, + { + "epoch": 5.034345047923322, + "grad_norm": 0.7786405086517334, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6303 + }, + { + "epoch": 5.035143769968051, + "grad_norm": 0.619868814945221, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6304 + }, + { + "epoch": 5.035942492012779, + "grad_norm": 6.039219379425049, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6305 + }, + { + "epoch": 5.036741214057508, + "grad_norm": 23.90920639038086, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 6306 + }, + { + "epoch": 5.037539936102236, + "grad_norm": 1.296809196472168, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 6307 + }, + { + "epoch": 5.038338658146965, + "grad_norm": 0.7673514485359192, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 6308 + }, + { + "epoch": 5.039137380191693, + "grad_norm": 0.5065979957580566, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 6309 + }, + { + "epoch": 5.039936102236422, + "grad_norm": 0.3858639597892761, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 6310 + }, + { + "epoch": 5.0407348242811505, + "grad_norm": 0.2647075653076172, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 6311 + }, + { + "epoch": 5.041533546325879, + "grad_norm": 0.2713094651699066, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 6312 + }, + { + "epoch": 5.042332268370607, + "grad_norm": 0.2573802173137665, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 6313 + }, + { + "epoch": 5.043130990415335, + "grad_norm": 0.2083175778388977, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 6314 + }, + { + "epoch": 5.043929712460064, + "grad_norm": 0.3625626564025879, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 6315 + }, + { + "epoch": 5.044728434504792, + "grad_norm": 0.331129789352417, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 6316 + }, + { + "epoch": 5.045527156549521, + "grad_norm": 0.23352555930614471, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 6317 + }, + { + "epoch": 5.046325878594249, + "grad_norm": 0.24043256044387817, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 6318 + }, + { + "epoch": 5.047124600638978, + "grad_norm": 0.31510207056999207, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 6319 + }, + { + "epoch": 5.047923322683706, + "grad_norm": 0.6896952390670776, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 6320 + }, + { + "epoch": 5.048722044728435, + "grad_norm": 0.7915457487106323, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 6321 + }, + { + "epoch": 5.0495207667731625, + "grad_norm": 0.2959117889404297, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 6322 + }, + { + "epoch": 5.050319488817891, + "grad_norm": 0.44844529032707214, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 6323 + }, + { + "epoch": 5.05111821086262, + "grad_norm": 0.3385697305202484, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 6324 + }, + { + "epoch": 5.051916932907348, + "grad_norm": 0.31220802664756775, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 6325 + }, + { + "epoch": 5.052715654952077, + "grad_norm": 0.3420731723308563, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 6326 + }, + { + "epoch": 5.053514376996805, + "grad_norm": 0.3061322569847107, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 6327 + }, + { + "epoch": 5.054313099041534, + "grad_norm": 0.6878030300140381, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 6328 + }, + { + "epoch": 5.055111821086262, + "grad_norm": 0.1927136927843094, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 6329 + }, + { + "epoch": 5.055910543130991, + "grad_norm": 0.24812163412570953, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6330 + }, + { + "epoch": 5.056709265175719, + "grad_norm": 0.19675321877002716, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6331 + }, + { + "epoch": 5.057507987220447, + "grad_norm": 0.20720984041690826, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6332 + }, + { + "epoch": 5.0583067092651754, + "grad_norm": 0.1260477900505066, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6333 + }, + { + "epoch": 5.059105431309904, + "grad_norm": 0.24399158358573914, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6334 + }, + { + "epoch": 5.0599041533546325, + "grad_norm": 0.22406993806362152, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6335 + }, + { + "epoch": 5.060702875399361, + "grad_norm": 0.24807684123516083, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6336 + }, + { + "epoch": 5.06150159744409, + "grad_norm": 0.1272616684436798, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6337 + }, + { + "epoch": 5.062300319488818, + "grad_norm": 0.2053418755531311, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6338 + }, + { + "epoch": 5.063099041533547, + "grad_norm": 0.13628287613391876, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6339 + }, + { + "epoch": 5.063897763578275, + "grad_norm": 0.21262522041797638, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6340 + }, + { + "epoch": 5.064696485623003, + "grad_norm": 0.3784351646900177, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6341 + }, + { + "epoch": 5.065495207667731, + "grad_norm": 0.3282131552696228, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6342 + }, + { + "epoch": 5.06629392971246, + "grad_norm": 0.10128312557935715, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6343 + }, + { + "epoch": 5.067092651757188, + "grad_norm": 0.2297000139951706, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6344 + }, + { + "epoch": 5.067891373801917, + "grad_norm": 0.11327458173036575, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6345 + }, + { + "epoch": 5.068690095846645, + "grad_norm": 0.16150346398353577, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6346 + }, + { + "epoch": 5.069488817891374, + "grad_norm": 0.15486986935138702, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6347 + }, + { + "epoch": 5.0702875399361025, + "grad_norm": 0.12427826225757599, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6348 + }, + { + "epoch": 5.071086261980831, + "grad_norm": 0.11321424692869186, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6349 + }, + { + "epoch": 5.0718849840255595, + "grad_norm": 0.12668851017951965, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6350 + }, + { + "epoch": 5.072683706070287, + "grad_norm": 0.20059579610824585, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6351 + }, + { + "epoch": 5.073482428115016, + "grad_norm": 0.14591605961322784, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6352 + }, + { + "epoch": 5.074281150159744, + "grad_norm": 0.19168664515018463, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6353 + }, + { + "epoch": 5.075079872204473, + "grad_norm": 0.19381079077720642, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6354 + }, + { + "epoch": 5.075878594249201, + "grad_norm": 0.0957496389746666, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6355 + }, + { + "epoch": 5.07667731629393, + "grad_norm": 0.11414145678281784, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6356 + }, + { + "epoch": 5.077476038338658, + "grad_norm": 0.10855124145746231, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6357 + }, + { + "epoch": 5.078274760383387, + "grad_norm": 0.2300068736076355, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6358 + }, + { + "epoch": 5.079073482428115, + "grad_norm": 0.15098270773887634, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 6359 + }, + { + "epoch": 5.079872204472843, + "grad_norm": 0.09821227937936783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6360 + }, + { + "epoch": 5.080670926517572, + "grad_norm": 0.135583758354187, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6361 + }, + { + "epoch": 5.0814696485623, + "grad_norm": 0.07262608408927917, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6362 + }, + { + "epoch": 5.082268370607029, + "grad_norm": 0.10731761902570724, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6363 + }, + { + "epoch": 5.083067092651757, + "grad_norm": 0.27508556842803955, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6364 + }, + { + "epoch": 5.083865814696486, + "grad_norm": 0.12996995449066162, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6365 + }, + { + "epoch": 5.084664536741214, + "grad_norm": 0.10386788845062256, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6366 + }, + { + "epoch": 5.085463258785943, + "grad_norm": 0.07591816782951355, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6367 + }, + { + "epoch": 5.086261980830671, + "grad_norm": 0.09341761469841003, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6368 + }, + { + "epoch": 5.0870607028754, + "grad_norm": 0.12575088441371918, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6369 + }, + { + "epoch": 5.087859424920127, + "grad_norm": 0.3423956036567688, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 6370 + }, + { + "epoch": 5.088658146964856, + "grad_norm": 0.2154775857925415, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6371 + }, + { + "epoch": 5.0894568690095845, + "grad_norm": 0.1550479382276535, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6372 + }, + { + "epoch": 5.090255591054313, + "grad_norm": 0.08802525699138641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6373 + }, + { + "epoch": 5.0910543130990416, + "grad_norm": 0.08421735465526581, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6374 + }, + { + "epoch": 5.09185303514377, + "grad_norm": 0.08920808881521225, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6375 + }, + { + "epoch": 5.092651757188499, + "grad_norm": 0.1450507938861847, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6376 + }, + { + "epoch": 5.093450479233227, + "grad_norm": 0.16926947236061096, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 6377 + }, + { + "epoch": 5.094249201277956, + "grad_norm": 0.6995428204536438, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6378 + }, + { + "epoch": 5.095047923322683, + "grad_norm": 0.10353969782590866, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6379 + }, + { + "epoch": 5.095846645367412, + "grad_norm": 0.09132180362939835, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6380 + }, + { + "epoch": 5.09664536741214, + "grad_norm": 0.17745476961135864, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6381 + }, + { + "epoch": 5.097444089456869, + "grad_norm": 0.10596930980682373, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6382 + }, + { + "epoch": 5.098242811501597, + "grad_norm": 0.11676348745822906, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6383 + }, + { + "epoch": 5.099041533546326, + "grad_norm": 0.13022664189338684, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6384 + }, + { + "epoch": 5.0998402555910545, + "grad_norm": 0.11169753223657608, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6385 + }, + { + "epoch": 5.100638977635783, + "grad_norm": 0.07439867407083511, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6386 + }, + { + "epoch": 5.1014376996805115, + "grad_norm": 0.06953777372837067, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6387 + }, + { + "epoch": 5.102236421725239, + "grad_norm": 0.09419669955968857, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6388 + }, + { + "epoch": 5.103035143769968, + "grad_norm": 0.1166587546467781, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6389 + }, + { + "epoch": 5.103833865814696, + "grad_norm": 0.5776185393333435, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6390 + }, + { + "epoch": 5.104632587859425, + "grad_norm": 0.13175810873508453, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6391 + }, + { + "epoch": 5.105431309904153, + "grad_norm": 0.09372890740633011, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6392 + }, + { + "epoch": 5.106230031948882, + "grad_norm": 0.25262513756752014, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6393 + }, + { + "epoch": 5.10702875399361, + "grad_norm": 0.1348644196987152, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6394 + }, + { + "epoch": 5.107827476038339, + "grad_norm": 0.23879335820674896, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6395 + }, + { + "epoch": 5.108626198083067, + "grad_norm": 0.25561729073524475, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6396 + }, + { + "epoch": 5.109424920127796, + "grad_norm": 0.26974916458129883, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6397 + }, + { + "epoch": 5.110223642172524, + "grad_norm": 0.1866329163312912, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6398 + }, + { + "epoch": 5.111022364217252, + "grad_norm": 0.22104737162590027, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6399 + }, + { + "epoch": 5.111821086261981, + "grad_norm": 0.3775753676891327, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6400 + }, + { + "epoch": 5.112619808306709, + "grad_norm": 0.20636002719402313, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6401 + }, + { + "epoch": 5.113418530351438, + "grad_norm": 0.1941772699356079, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6402 + }, + { + "epoch": 5.114217252396166, + "grad_norm": 0.14595480263233185, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6403 + }, + { + "epoch": 5.115015974440895, + "grad_norm": 0.16794493794441223, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6404 + }, + { + "epoch": 5.115814696485623, + "grad_norm": 0.16466112434864044, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6405 + }, + { + "epoch": 5.116613418530352, + "grad_norm": 0.27192312479019165, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6406 + }, + { + "epoch": 5.11741214057508, + "grad_norm": 0.296017050743103, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6407 + }, + { + "epoch": 5.118210862619808, + "grad_norm": 0.24947655200958252, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6408 + }, + { + "epoch": 5.1190095846645365, + "grad_norm": 0.07843278348445892, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6409 + }, + { + "epoch": 5.119808306709265, + "grad_norm": 0.2507891356945038, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6410 + }, + { + "epoch": 5.1206070287539935, + "grad_norm": 0.2962022125720978, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6411 + }, + { + "epoch": 5.121405750798722, + "grad_norm": 0.21588601171970367, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6412 + }, + { + "epoch": 5.122204472843451, + "grad_norm": 0.27223092317581177, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6413 + }, + { + "epoch": 5.123003194888179, + "grad_norm": 0.1475650519132614, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6414 + }, + { + "epoch": 5.123801916932908, + "grad_norm": 0.2624805271625519, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6415 + }, + { + "epoch": 5.124600638977636, + "grad_norm": 0.27691081166267395, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6416 + }, + { + "epoch": 5.125399361022364, + "grad_norm": 0.1828494369983673, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6417 + }, + { + "epoch": 5.126198083067092, + "grad_norm": 0.27542614936828613, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 6418 + }, + { + "epoch": 5.126996805111821, + "grad_norm": 0.16250371932983398, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6419 + }, + { + "epoch": 5.127795527156549, + "grad_norm": 0.17180733382701874, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6420 + }, + { + "epoch": 5.128594249201278, + "grad_norm": 0.21466004848480225, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6421 + }, + { + "epoch": 5.1293929712460065, + "grad_norm": 0.13144539296627045, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6422 + }, + { + "epoch": 5.130191693290735, + "grad_norm": 0.158688023686409, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6423 + }, + { + "epoch": 5.1309904153354635, + "grad_norm": 0.1430175006389618, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6424 + }, + { + "epoch": 5.131789137380192, + "grad_norm": 0.0988554134964943, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6425 + }, + { + "epoch": 5.13258785942492, + "grad_norm": 0.18320757150650024, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6426 + }, + { + "epoch": 5.133386581469648, + "grad_norm": 0.34172165393829346, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6427 + }, + { + "epoch": 5.134185303514377, + "grad_norm": 0.095450758934021, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6428 + }, + { + "epoch": 5.134984025559105, + "grad_norm": 0.2988479733467102, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6429 + }, + { + "epoch": 5.135782747603834, + "grad_norm": 0.11462085694074631, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6430 + }, + { + "epoch": 5.136581469648562, + "grad_norm": 0.11989153176546097, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6431 + }, + { + "epoch": 5.137380191693291, + "grad_norm": 0.15308552980422974, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6432 + }, + { + "epoch": 5.138178913738019, + "grad_norm": 0.1119944304227829, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6433 + }, + { + "epoch": 5.138977635782748, + "grad_norm": 0.38812172412872314, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6434 + }, + { + "epoch": 5.139776357827476, + "grad_norm": 0.24718649685382843, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6435 + }, + { + "epoch": 5.140575079872204, + "grad_norm": 0.15834778547286987, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6436 + }, + { + "epoch": 5.141373801916933, + "grad_norm": 0.1960451751947403, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6437 + }, + { + "epoch": 5.142172523961661, + "grad_norm": 0.16195416450500488, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6438 + }, + { + "epoch": 5.14297124600639, + "grad_norm": 0.07554367184638977, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6439 + }, + { + "epoch": 5.143769968051118, + "grad_norm": 0.18924687802791595, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6440 + }, + { + "epoch": 5.144568690095847, + "grad_norm": 0.16253480315208435, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6441 + }, + { + "epoch": 5.145367412140575, + "grad_norm": 0.12711918354034424, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6442 + }, + { + "epoch": 5.146166134185304, + "grad_norm": 0.16831086575984955, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6443 + }, + { + "epoch": 5.146964856230032, + "grad_norm": 0.35199087858200073, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6444 + }, + { + "epoch": 5.147763578274761, + "grad_norm": 0.1340232491493225, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6445 + }, + { + "epoch": 5.1485623003194885, + "grad_norm": 0.1397274285554886, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6446 + }, + { + "epoch": 5.149361022364217, + "grad_norm": 0.13868366181850433, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6447 + }, + { + "epoch": 5.1501597444089455, + "grad_norm": 0.08846192806959152, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6448 + }, + { + "epoch": 5.150958466453674, + "grad_norm": 0.08350610733032227, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6449 + }, + { + "epoch": 5.151757188498403, + "grad_norm": 0.14727875590324402, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6450 + }, + { + "epoch": 5.152555910543131, + "grad_norm": 0.11705708503723145, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6451 + }, + { + "epoch": 5.15335463258786, + "grad_norm": 0.10308192670345306, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6452 + }, + { + "epoch": 5.154153354632588, + "grad_norm": 0.09459209442138672, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6453 + }, + { + "epoch": 5.154952076677317, + "grad_norm": 0.11605191230773926, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6454 + }, + { + "epoch": 5.155750798722044, + "grad_norm": 0.24275821447372437, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6455 + }, + { + "epoch": 5.156549520766773, + "grad_norm": 0.208640456199646, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6456 + }, + { + "epoch": 5.157348242811501, + "grad_norm": 0.15257662534713745, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6457 + }, + { + "epoch": 5.15814696485623, + "grad_norm": 0.10431355237960815, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6458 + }, + { + "epoch": 5.1589456869009584, + "grad_norm": 0.14187589287757874, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6459 + }, + { + "epoch": 5.159744408945687, + "grad_norm": 0.19084404408931732, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6460 + }, + { + "epoch": 5.1605431309904155, + "grad_norm": 0.09255128353834152, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6461 + }, + { + "epoch": 5.161341853035144, + "grad_norm": 0.1443471759557724, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6462 + }, + { + "epoch": 5.162140575079873, + "grad_norm": 0.36597245931625366, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6463 + }, + { + "epoch": 5.1629392971246, + "grad_norm": 0.3835389316082001, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6464 + }, + { + "epoch": 5.163738019169329, + "grad_norm": 0.14208771288394928, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6465 + }, + { + "epoch": 5.164536741214057, + "grad_norm": 0.2520706355571747, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6466 + }, + { + "epoch": 5.165335463258786, + "grad_norm": 0.2595224976539612, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6467 + }, + { + "epoch": 5.166134185303514, + "grad_norm": 0.15721063315868378, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6468 + }, + { + "epoch": 5.166932907348243, + "grad_norm": 0.1772007793188095, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6469 + }, + { + "epoch": 5.167731629392971, + "grad_norm": 0.19899888336658478, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6470 + }, + { + "epoch": 5.1685303514377, + "grad_norm": 0.18689346313476562, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6471 + }, + { + "epoch": 5.169329073482428, + "grad_norm": 0.16748468577861786, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6472 + }, + { + "epoch": 5.170127795527157, + "grad_norm": 0.13296879827976227, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6473 + }, + { + "epoch": 5.170926517571885, + "grad_norm": 0.18742166459560394, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6474 + }, + { + "epoch": 5.171725239616613, + "grad_norm": 0.17811308801174164, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6475 + }, + { + "epoch": 5.172523961661342, + "grad_norm": 0.1360485702753067, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 6476 + }, + { + "epoch": 5.17332268370607, + "grad_norm": 0.13431121408939362, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6477 + }, + { + "epoch": 5.174121405750799, + "grad_norm": 0.12888069450855255, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6478 + }, + { + "epoch": 5.174920127795527, + "grad_norm": 0.15194712579250336, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6479 + }, + { + "epoch": 5.175718849840256, + "grad_norm": 0.13076889514923096, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6480 + }, + { + "epoch": 5.176517571884984, + "grad_norm": 0.14751110970973969, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6481 + }, + { + "epoch": 5.177316293929713, + "grad_norm": 0.11919333785772324, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6482 + }, + { + "epoch": 5.178115015974441, + "grad_norm": 0.12712688744068146, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6483 + }, + { + "epoch": 5.178913738019169, + "grad_norm": 0.13765369355678558, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 6484 + }, + { + "epoch": 5.1797124600638975, + "grad_norm": 0.11060373485088348, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6485 + }, + { + "epoch": 5.180511182108626, + "grad_norm": 0.056882213801145554, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6486 + }, + { + "epoch": 5.181309904153355, + "grad_norm": 0.11317770928144455, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6487 + }, + { + "epoch": 5.182108626198083, + "grad_norm": 0.09279809147119522, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6488 + }, + { + "epoch": 5.182907348242812, + "grad_norm": 0.09392786771059036, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6489 + }, + { + "epoch": 5.18370607028754, + "grad_norm": 0.13042815029621124, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6490 + }, + { + "epoch": 5.184504792332269, + "grad_norm": 0.07929978519678116, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6491 + }, + { + "epoch": 5.185303514376997, + "grad_norm": 0.12215851992368698, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6492 + }, + { + "epoch": 5.186102236421725, + "grad_norm": 0.12000773102045059, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6493 + }, + { + "epoch": 5.186900958466453, + "grad_norm": 0.08427707850933075, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6494 + }, + { + "epoch": 5.187699680511182, + "grad_norm": 0.158653125166893, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6495 + }, + { + "epoch": 5.18849840255591, + "grad_norm": 0.11087878793478012, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6496 + }, + { + "epoch": 5.189297124600639, + "grad_norm": 0.12649668753147125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6497 + }, + { + "epoch": 5.1900958466453675, + "grad_norm": 0.0821281224489212, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6498 + }, + { + "epoch": 5.190894568690096, + "grad_norm": 0.07192671298980713, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6499 + }, + { + "epoch": 5.1916932907348246, + "grad_norm": 0.10505214333534241, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6500 + }, + { + "epoch": 5.192492012779553, + "grad_norm": 0.11772353947162628, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6501 + }, + { + "epoch": 5.193290734824281, + "grad_norm": 0.15557901561260223, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6502 + }, + { + "epoch": 5.194089456869009, + "grad_norm": 0.09753020852804184, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6503 + }, + { + "epoch": 5.194888178913738, + "grad_norm": 0.10331830382347107, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6504 + }, + { + "epoch": 5.195686900958466, + "grad_norm": 0.130085289478302, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6505 + }, + { + "epoch": 5.196485623003195, + "grad_norm": 0.08772018551826477, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6506 + }, + { + "epoch": 5.197284345047923, + "grad_norm": 0.1906667798757553, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6507 + }, + { + "epoch": 5.198083067092652, + "grad_norm": 0.06724394112825394, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6508 + }, + { + "epoch": 5.19888178913738, + "grad_norm": 0.1141325905919075, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6509 + }, + { + "epoch": 5.199680511182109, + "grad_norm": 0.08354665338993073, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6510 + }, + { + "epoch": 5.2004792332268375, + "grad_norm": 0.1072440817952156, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6511 + }, + { + "epoch": 5.201277955271565, + "grad_norm": 0.10670839250087738, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6512 + }, + { + "epoch": 5.202076677316294, + "grad_norm": 0.10079781711101532, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6513 + }, + { + "epoch": 5.202875399361022, + "grad_norm": 0.1281125396490097, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6514 + }, + { + "epoch": 5.203674121405751, + "grad_norm": 0.1627720147371292, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6515 + }, + { + "epoch": 5.204472843450479, + "grad_norm": 0.1507575958967209, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6516 + }, + { + "epoch": 5.205271565495208, + "grad_norm": 0.17764779925346375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6517 + }, + { + "epoch": 5.206070287539936, + "grad_norm": 0.1825307011604309, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6518 + }, + { + "epoch": 5.206869009584665, + "grad_norm": 0.1151907742023468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6519 + }, + { + "epoch": 5.207667731629393, + "grad_norm": 0.1425708830356598, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6520 + }, + { + "epoch": 5.208466453674121, + "grad_norm": 0.08555550873279572, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6521 + }, + { + "epoch": 5.2092651757188495, + "grad_norm": 0.15400084853172302, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6522 + }, + { + "epoch": 5.210063897763578, + "grad_norm": 0.11088921129703522, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6523 + }, + { + "epoch": 5.210862619808307, + "grad_norm": 0.0959518551826477, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6524 + }, + { + "epoch": 5.211661341853035, + "grad_norm": 0.1054866686463356, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6525 + }, + { + "epoch": 5.212460063897764, + "grad_norm": 0.17849107086658478, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6526 + }, + { + "epoch": 5.213258785942492, + "grad_norm": 0.0910423994064331, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6527 + }, + { + "epoch": 5.214057507987221, + "grad_norm": 0.10857872664928436, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6528 + }, + { + "epoch": 5.214856230031949, + "grad_norm": 0.09012399613857269, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6529 + }, + { + "epoch": 5.215654952076678, + "grad_norm": 0.14724178612232208, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6530 + }, + { + "epoch": 5.216453674121405, + "grad_norm": 0.11357409507036209, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6531 + }, + { + "epoch": 5.217252396166134, + "grad_norm": 0.09721364825963974, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6532 + }, + { + "epoch": 5.218051118210862, + "grad_norm": 0.07837430387735367, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6533 + }, + { + "epoch": 5.218849840255591, + "grad_norm": 0.1181735098361969, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6534 + }, + { + "epoch": 5.2196485623003195, + "grad_norm": 0.07066017389297485, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6535 + }, + { + "epoch": 5.220447284345048, + "grad_norm": 0.06838417053222656, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6536 + }, + { + "epoch": 5.2212460063897765, + "grad_norm": 0.0919245257973671, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6537 + }, + { + "epoch": 5.222044728434505, + "grad_norm": 0.06859984248876572, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6538 + }, + { + "epoch": 5.222843450479234, + "grad_norm": 1.929213523864746, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6539 + }, + { + "epoch": 5.223642172523961, + "grad_norm": 0.11181562393903732, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6540 + }, + { + "epoch": 5.22444089456869, + "grad_norm": 0.09261998534202576, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6541 + }, + { + "epoch": 5.225239616613418, + "grad_norm": 0.11214403063058853, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6542 + }, + { + "epoch": 5.226038338658147, + "grad_norm": 0.1353820264339447, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6543 + }, + { + "epoch": 5.226837060702875, + "grad_norm": 0.11579953879117966, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 6544 + }, + { + "epoch": 5.227635782747604, + "grad_norm": 0.08284885436296463, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6545 + }, + { + "epoch": 5.228434504792332, + "grad_norm": 0.13805733621120453, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6546 + }, + { + "epoch": 5.229233226837061, + "grad_norm": 0.08924185484647751, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6547 + }, + { + "epoch": 5.2300319488817895, + "grad_norm": 0.10975285619497299, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6548 + }, + { + "epoch": 5.230830670926518, + "grad_norm": 0.10500271618366241, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6549 + }, + { + "epoch": 5.231629392971246, + "grad_norm": 0.09947814792394638, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 5.232428115015974, + "grad_norm": 0.10113594681024551, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6551 + }, + { + "epoch": 5.233226837060703, + "grad_norm": 0.12645265460014343, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6552 + }, + { + "epoch": 5.234025559105431, + "grad_norm": 0.06775741279125214, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6553 + }, + { + "epoch": 5.23482428115016, + "grad_norm": 0.09799529612064362, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6554 + }, + { + "epoch": 5.235623003194888, + "grad_norm": 0.13129538297653198, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6555 + }, + { + "epoch": 5.236421725239617, + "grad_norm": 0.10139735788106918, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6556 + }, + { + "epoch": 5.237220447284345, + "grad_norm": 0.13819058239459991, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6557 + }, + { + "epoch": 5.238019169329074, + "grad_norm": 0.09306512027978897, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6558 + }, + { + "epoch": 5.2388178913738015, + "grad_norm": 0.07963602244853973, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6559 + }, + { + "epoch": 5.23961661341853, + "grad_norm": 0.12864448130130768, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6560 + }, + { + "epoch": 5.2404153354632586, + "grad_norm": 0.1044403612613678, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6561 + }, + { + "epoch": 5.241214057507987, + "grad_norm": 0.07623843848705292, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6562 + }, + { + "epoch": 5.242012779552716, + "grad_norm": 0.10385097563266754, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6563 + }, + { + "epoch": 5.242811501597444, + "grad_norm": 0.07048188149929047, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6564 + }, + { + "epoch": 5.243610223642173, + "grad_norm": 0.25789955258369446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6565 + }, + { + "epoch": 5.244408945686901, + "grad_norm": 0.12271685153245926, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6566 + }, + { + "epoch": 5.24520766773163, + "grad_norm": 0.10512058436870575, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6567 + }, + { + "epoch": 5.246006389776358, + "grad_norm": 0.07663438469171524, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6568 + }, + { + "epoch": 5.246805111821086, + "grad_norm": 0.09937599301338196, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6569 + }, + { + "epoch": 5.247603833865814, + "grad_norm": 0.12242338061332703, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6570 + }, + { + "epoch": 5.248402555910543, + "grad_norm": 0.1733475625514984, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6571 + }, + { + "epoch": 5.2492012779552715, + "grad_norm": 0.1460944414138794, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6572 + }, + { + "epoch": 5.25, + "grad_norm": 0.09406521171331406, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6573 + }, + { + "epoch": 5.2507987220447285, + "grad_norm": 1.0146688222885132, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6574 + }, + { + "epoch": 5.251597444089457, + "grad_norm": 0.10557705909013748, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6575 + }, + { + "epoch": 5.252396166134186, + "grad_norm": 0.1306990385055542, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6576 + }, + { + "epoch": 5.253194888178914, + "grad_norm": 0.094961017370224, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6577 + }, + { + "epoch": 5.253993610223642, + "grad_norm": 0.13421863317489624, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6578 + }, + { + "epoch": 5.25479233226837, + "grad_norm": 0.12371776252985, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6579 + }, + { + "epoch": 5.255591054313099, + "grad_norm": 0.15863509476184845, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6580 + }, + { + "epoch": 5.256389776357827, + "grad_norm": 0.1156599149107933, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6581 + }, + { + "epoch": 5.257188498402556, + "grad_norm": 0.07102219015359879, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6582 + }, + { + "epoch": 5.257987220447284, + "grad_norm": 0.09030039608478546, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6583 + }, + { + "epoch": 5.258785942492013, + "grad_norm": 0.08848102390766144, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6584 + }, + { + "epoch": 5.2595846645367414, + "grad_norm": 0.07455430924892426, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6585 + }, + { + "epoch": 5.26038338658147, + "grad_norm": 0.07729559391736984, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6586 + }, + { + "epoch": 5.261182108626198, + "grad_norm": 0.0955357626080513, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6587 + }, + { + "epoch": 5.261980830670926, + "grad_norm": 0.08680911362171173, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6588 + }, + { + "epoch": 5.262779552715655, + "grad_norm": 0.1033414825797081, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6589 + }, + { + "epoch": 5.263578274760383, + "grad_norm": 0.09428979456424713, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6590 + }, + { + "epoch": 5.264376996805112, + "grad_norm": 0.07567942887544632, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6591 + }, + { + "epoch": 5.26517571884984, + "grad_norm": 0.221647247672081, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6592 + }, + { + "epoch": 5.265974440894569, + "grad_norm": 0.13839758932590485, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6593 + }, + { + "epoch": 5.266773162939297, + "grad_norm": 0.06060291454195976, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6594 + }, + { + "epoch": 5.267571884984026, + "grad_norm": 0.09146185964345932, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6595 + }, + { + "epoch": 5.268370607028754, + "grad_norm": 0.05557526275515556, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6596 + }, + { + "epoch": 5.269169329073483, + "grad_norm": 0.10190495103597641, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6597 + }, + { + "epoch": 5.2699680511182105, + "grad_norm": 0.07389659434556961, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6598 + }, + { + "epoch": 5.270766773162939, + "grad_norm": 0.11124115437269211, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6599 + }, + { + "epoch": 5.271565495207668, + "grad_norm": 0.10779515653848648, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6600 + }, + { + "epoch": 5.272364217252396, + "grad_norm": 0.09347773343324661, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6601 + }, + { + "epoch": 5.273162939297125, + "grad_norm": 0.15056683123111725, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6602 + }, + { + "epoch": 5.273961661341853, + "grad_norm": 0.1398572027683258, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6603 + }, + { + "epoch": 5.274760383386582, + "grad_norm": 0.08360682427883148, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6604 + }, + { + "epoch": 5.27555910543131, + "grad_norm": 0.10360747575759888, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6605 + }, + { + "epoch": 5.276357827476039, + "grad_norm": 0.0864897072315216, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6606 + }, + { + "epoch": 5.277156549520766, + "grad_norm": 0.11505412310361862, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6607 + }, + { + "epoch": 5.277955271565495, + "grad_norm": 0.10638110339641571, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6608 + }, + { + "epoch": 5.2787539936102235, + "grad_norm": 0.08349479734897614, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6609 + }, + { + "epoch": 5.279552715654952, + "grad_norm": 0.14465951919555664, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6610 + }, + { + "epoch": 5.2803514376996805, + "grad_norm": 0.08049577474594116, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6611 + }, + { + "epoch": 5.281150159744409, + "grad_norm": 0.10206092149019241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6612 + }, + { + "epoch": 5.281948881789138, + "grad_norm": 0.2721571922302246, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6613 + }, + { + "epoch": 5.282747603833866, + "grad_norm": 0.17503346502780914, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6614 + }, + { + "epoch": 5.283546325878595, + "grad_norm": 0.11459292471408844, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6615 + }, + { + "epoch": 5.284345047923322, + "grad_norm": 0.9974967241287231, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6616 + }, + { + "epoch": 5.285143769968051, + "grad_norm": 0.11502816528081894, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6617 + }, + { + "epoch": 5.285942492012779, + "grad_norm": 0.12992256879806519, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6618 + }, + { + "epoch": 5.286741214057508, + "grad_norm": 0.19872024655342102, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6619 + }, + { + "epoch": 5.287539936102236, + "grad_norm": 0.13013097643852234, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6620 + }, + { + "epoch": 5.288338658146965, + "grad_norm": 0.13644525408744812, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6621 + }, + { + "epoch": 5.289137380191693, + "grad_norm": 0.15101996064186096, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6622 + }, + { + "epoch": 5.289936102236422, + "grad_norm": 0.11075131595134735, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6623 + }, + { + "epoch": 5.2907348242811505, + "grad_norm": 0.0904511958360672, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6624 + }, + { + "epoch": 5.291533546325878, + "grad_norm": 0.08861460536718369, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6625 + }, + { + "epoch": 5.292332268370607, + "grad_norm": 0.10443824529647827, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6626 + }, + { + "epoch": 5.293130990415335, + "grad_norm": 0.07440674304962158, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6627 + }, + { + "epoch": 5.293929712460064, + "grad_norm": 0.21709975600242615, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6628 + }, + { + "epoch": 5.294728434504792, + "grad_norm": 0.1281055063009262, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6629 + }, + { + "epoch": 5.295527156549521, + "grad_norm": 0.10365202277898788, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6630 + }, + { + "epoch": 5.296325878594249, + "grad_norm": 1.004258632659912, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6631 + }, + { + "epoch": 5.297124600638978, + "grad_norm": 0.16660870611667633, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6632 + }, + { + "epoch": 5.297923322683706, + "grad_norm": 0.1146734207868576, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6633 + }, + { + "epoch": 5.298722044728435, + "grad_norm": 0.18288104236125946, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6634 + }, + { + "epoch": 5.2995207667731625, + "grad_norm": 0.11469347029924393, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6635 + }, + { + "epoch": 5.300319488817891, + "grad_norm": 0.1333407461643219, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6636 + }, + { + "epoch": 5.30111821086262, + "grad_norm": 0.15359243750572205, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6637 + }, + { + "epoch": 5.301916932907348, + "grad_norm": 0.0832027792930603, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6638 + }, + { + "epoch": 5.302715654952077, + "grad_norm": 0.10231718420982361, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6639 + }, + { + "epoch": 5.303514376996805, + "grad_norm": 0.11031626909971237, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6640 + }, + { + "epoch": 5.304313099041534, + "grad_norm": 0.08014792948961258, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6641 + }, + { + "epoch": 5.305111821086262, + "grad_norm": 0.10066475719213486, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6642 + }, + { + "epoch": 5.305910543130991, + "grad_norm": 0.12824396789073944, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6643 + }, + { + "epoch": 5.306709265175719, + "grad_norm": 0.09452345222234726, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6644 + }, + { + "epoch": 5.307507987220447, + "grad_norm": 0.09100557118654251, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6645 + }, + { + "epoch": 5.3083067092651754, + "grad_norm": 0.07995713502168655, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6646 + }, + { + "epoch": 5.309105431309904, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6647 + }, + { + "epoch": 5.3099041533546325, + "grad_norm": 0.09881234914064407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6648 + }, + { + "epoch": 5.310702875399361, + "grad_norm": 0.08131393790245056, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6649 + }, + { + "epoch": 5.31150159744409, + "grad_norm": 0.08842889964580536, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6650 + }, + { + "epoch": 5.312300319488818, + "grad_norm": 0.12630115449428558, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6651 + }, + { + "epoch": 5.313099041533547, + "grad_norm": 0.13429711759090424, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6652 + }, + { + "epoch": 5.313897763578275, + "grad_norm": 0.11347261816263199, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6653 + }, + { + "epoch": 5.314696485623003, + "grad_norm": 0.1555728167295456, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6654 + }, + { + "epoch": 5.315495207667731, + "grad_norm": 0.13184282183647156, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6655 + }, + { + "epoch": 5.31629392971246, + "grad_norm": 0.07821093499660492, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6656 + }, + { + "epoch": 5.317092651757188, + "grad_norm": 0.1300499588251114, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6657 + }, + { + "epoch": 5.317891373801917, + "grad_norm": 0.14896781742572784, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6658 + }, + { + "epoch": 5.318690095846645, + "grad_norm": 0.13370175659656525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6659 + }, + { + "epoch": 5.319488817891374, + "grad_norm": 0.14055652916431427, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6660 + }, + { + "epoch": 5.3202875399361025, + "grad_norm": 0.11674464493989944, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6661 + }, + { + "epoch": 5.321086261980831, + "grad_norm": 0.13155756890773773, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6662 + }, + { + "epoch": 5.321884984025559, + "grad_norm": 0.09616535156965256, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6663 + }, + { + "epoch": 5.322683706070287, + "grad_norm": 0.4228188991546631, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6664 + }, + { + "epoch": 5.323482428115016, + "grad_norm": 0.10942913591861725, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6665 + }, + { + "epoch": 5.324281150159744, + "grad_norm": 0.15592730045318604, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6666 + }, + { + "epoch": 5.325079872204473, + "grad_norm": 0.16837753355503082, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6667 + }, + { + "epoch": 5.325878594249201, + "grad_norm": 0.10512012243270874, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6668 + }, + { + "epoch": 5.32667731629393, + "grad_norm": 0.10834471136331558, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6669 + }, + { + "epoch": 5.327476038338658, + "grad_norm": 0.06588451564311981, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6670 + }, + { + "epoch": 5.328274760383387, + "grad_norm": 0.08714822679758072, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6671 + }, + { + "epoch": 5.329073482428115, + "grad_norm": 0.16129685938358307, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6672 + }, + { + "epoch": 5.329872204472843, + "grad_norm": 0.09294751286506653, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6673 + }, + { + "epoch": 5.330670926517572, + "grad_norm": 0.09905052185058594, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6674 + }, + { + "epoch": 5.3314696485623, + "grad_norm": 0.14584603905677795, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6675 + }, + { + "epoch": 5.332268370607029, + "grad_norm": 0.08384378254413605, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6676 + }, + { + "epoch": 5.333067092651757, + "grad_norm": 0.1672045886516571, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6677 + }, + { + "epoch": 5.333865814696486, + "grad_norm": 0.21656489372253418, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6678 + }, + { + "epoch": 5.334664536741214, + "grad_norm": 0.17034684121608734, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6679 + }, + { + "epoch": 5.335463258785943, + "grad_norm": 0.3153417408466339, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6680 + }, + { + "epoch": 5.336261980830671, + "grad_norm": 0.1953393816947937, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6681 + }, + { + "epoch": 5.3370607028754, + "grad_norm": 0.2085847705602646, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6682 + }, + { + "epoch": 5.337859424920127, + "grad_norm": 0.2679558992385864, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6683 + }, + { + "epoch": 5.338658146964856, + "grad_norm": 0.08705966919660568, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6684 + }, + { + "epoch": 5.3394568690095845, + "grad_norm": 0.09011410176753998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6685 + }, + { + "epoch": 5.340255591054313, + "grad_norm": 0.10358326137065887, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6686 + }, + { + "epoch": 5.3410543130990416, + "grad_norm": 0.08191518485546112, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6687 + }, + { + "epoch": 5.34185303514377, + "grad_norm": 0.0676165446639061, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6688 + }, + { + "epoch": 5.342651757188499, + "grad_norm": 0.18006695806980133, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6689 + }, + { + "epoch": 5.343450479233227, + "grad_norm": 0.11935598403215408, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.344249201277956, + "grad_norm": 0.14136075973510742, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6691 + }, + { + "epoch": 5.345047923322683, + "grad_norm": 0.19367988407611847, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6692 + }, + { + "epoch": 5.345846645367412, + "grad_norm": 0.1283622533082962, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6693 + }, + { + "epoch": 5.34664536741214, + "grad_norm": 0.11303326487541199, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6694 + }, + { + "epoch": 5.347444089456869, + "grad_norm": 0.09076731652021408, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6695 + }, + { + "epoch": 5.348242811501597, + "grad_norm": 0.12625159323215485, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6696 + }, + { + "epoch": 5.349041533546326, + "grad_norm": 0.18254370987415314, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6697 + }, + { + "epoch": 5.3498402555910545, + "grad_norm": 0.12221173942089081, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6698 + }, + { + "epoch": 5.350638977635783, + "grad_norm": 0.11586996912956238, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6699 + }, + { + "epoch": 5.3514376996805115, + "grad_norm": 0.1012619286775589, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6700 + }, + { + "epoch": 5.352236421725239, + "grad_norm": 0.10728003084659576, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6701 + }, + { + "epoch": 5.353035143769968, + "grad_norm": 0.08077894896268845, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6702 + }, + { + "epoch": 5.353833865814696, + "grad_norm": 0.10069102048873901, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6703 + }, + { + "epoch": 5.354632587859425, + "grad_norm": 0.11007717996835709, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6704 + }, + { + "epoch": 5.355431309904153, + "grad_norm": 0.08088147640228271, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6705 + }, + { + "epoch": 5.356230031948882, + "grad_norm": 0.06969337165355682, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6706 + }, + { + "epoch": 5.35702875399361, + "grad_norm": 0.09731647372245789, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6707 + }, + { + "epoch": 5.357827476038339, + "grad_norm": 0.07404995709657669, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6708 + }, + { + "epoch": 5.358626198083067, + "grad_norm": 0.09361755102872849, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6709 + }, + { + "epoch": 5.359424920127796, + "grad_norm": 0.11929210275411606, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6710 + }, + { + "epoch": 5.360223642172524, + "grad_norm": 0.11107892543077469, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6711 + }, + { + "epoch": 5.361022364217252, + "grad_norm": 0.10966535657644272, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6712 + }, + { + "epoch": 5.361821086261981, + "grad_norm": 0.11830565333366394, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6713 + }, + { + "epoch": 5.362619808306709, + "grad_norm": 0.15130563080310822, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6714 + }, + { + "epoch": 5.363418530351438, + "grad_norm": 0.12608309090137482, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6715 + }, + { + "epoch": 5.364217252396166, + "grad_norm": 0.10768693685531616, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6716 + }, + { + "epoch": 5.365015974440895, + "grad_norm": 0.10020256787538528, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6717 + }, + { + "epoch": 5.365814696485623, + "grad_norm": 0.11352406442165375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6718 + }, + { + "epoch": 5.366613418530352, + "grad_norm": 0.10058535635471344, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6719 + }, + { + "epoch": 5.36741214057508, + "grad_norm": 0.08427922427654266, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6720 + }, + { + "epoch": 5.368210862619808, + "grad_norm": 0.08600196242332458, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6721 + }, + { + "epoch": 5.3690095846645365, + "grad_norm": 0.0891844630241394, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 6722 + }, + { + "epoch": 5.369808306709265, + "grad_norm": 0.07231339812278748, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6723 + }, + { + "epoch": 5.3706070287539935, + "grad_norm": 0.0866503193974495, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6724 + }, + { + "epoch": 5.371405750798722, + "grad_norm": 0.44905656576156616, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6725 + }, + { + "epoch": 5.372204472843451, + "grad_norm": 0.2192242592573166, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6726 + }, + { + "epoch": 5.373003194888179, + "grad_norm": 0.15841859579086304, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6727 + }, + { + "epoch": 5.373801916932908, + "grad_norm": 0.1254468858242035, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6728 + }, + { + "epoch": 5.374600638977636, + "grad_norm": 1.5675911903381348, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6729 + }, + { + "epoch": 5.375399361022364, + "grad_norm": 0.20507164299488068, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6730 + }, + { + "epoch": 5.376198083067092, + "grad_norm": 0.26948630809783936, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6731 + }, + { + "epoch": 5.376996805111821, + "grad_norm": 0.15447315573692322, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6732 + }, + { + "epoch": 5.377795527156549, + "grad_norm": 0.17888243496418, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6733 + }, + { + "epoch": 5.378594249201278, + "grad_norm": 0.24683290719985962, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6734 + }, + { + "epoch": 5.3793929712460065, + "grad_norm": 0.15786881744861603, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6735 + }, + { + "epoch": 5.380191693290735, + "grad_norm": 0.18426702916622162, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6736 + }, + { + "epoch": 5.3809904153354635, + "grad_norm": 0.14444448053836823, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6737 + }, + { + "epoch": 5.381789137380192, + "grad_norm": 0.135011225938797, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6738 + }, + { + "epoch": 5.38258785942492, + "grad_norm": 0.19057826697826385, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6739 + }, + { + "epoch": 5.383386581469648, + "grad_norm": 0.12282486259937286, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6740 + }, + { + "epoch": 5.384185303514377, + "grad_norm": 0.17092294991016388, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6741 + }, + { + "epoch": 5.384984025559105, + "grad_norm": 0.19800473749637604, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6742 + }, + { + "epoch": 5.385782747603834, + "grad_norm": 0.07987766712903976, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6743 + }, + { + "epoch": 5.386581469648562, + "grad_norm": 0.18386386334896088, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6744 + }, + { + "epoch": 5.387380191693291, + "grad_norm": 0.16529197990894318, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6745 + }, + { + "epoch": 5.388178913738019, + "grad_norm": 0.09607496112585068, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6746 + }, + { + "epoch": 5.388977635782748, + "grad_norm": 0.15966713428497314, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6747 + }, + { + "epoch": 5.389776357827476, + "grad_norm": 0.1622796356678009, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6748 + }, + { + "epoch": 5.390575079872204, + "grad_norm": 0.09537432342767715, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6749 + }, + { + "epoch": 5.391373801916933, + "grad_norm": 0.1766965389251709, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6750 + }, + { + "epoch": 5.392172523961661, + "grad_norm": 0.21354711055755615, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6751 + }, + { + "epoch": 5.39297124600639, + "grad_norm": 0.093564473092556, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6752 + }, + { + "epoch": 5.393769968051118, + "grad_norm": 0.14756347239017487, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6753 + }, + { + "epoch": 5.394568690095847, + "grad_norm": 0.10537468641996384, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6754 + }, + { + "epoch": 5.395367412140575, + "grad_norm": 0.15626567602157593, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6755 + }, + { + "epoch": 5.396166134185304, + "grad_norm": 0.16282637417316437, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6756 + }, + { + "epoch": 5.396964856230032, + "grad_norm": 0.0745241791009903, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6757 + }, + { + "epoch": 5.397763578274761, + "grad_norm": 0.1221894845366478, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6758 + }, + { + "epoch": 5.3985623003194885, + "grad_norm": 0.08314131945371628, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6759 + }, + { + "epoch": 5.399361022364217, + "grad_norm": 0.12707264721393585, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6760 + }, + { + "epoch": 5.4001597444089455, + "grad_norm": 0.12036006152629852, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6761 + }, + { + "epoch": 5.400958466453674, + "grad_norm": 0.12769176065921783, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6762 + }, + { + "epoch": 5.401757188498403, + "grad_norm": 0.2201661318540573, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6763 + }, + { + "epoch": 5.402555910543131, + "grad_norm": 0.15013982355594635, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6764 + }, + { + "epoch": 5.40335463258786, + "grad_norm": 0.7714766263961792, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6765 + }, + { + "epoch": 5.404153354632588, + "grad_norm": 0.20359933376312256, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6766 + }, + { + "epoch": 5.404952076677317, + "grad_norm": 0.12684984505176544, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6767 + }, + { + "epoch": 5.405750798722044, + "grad_norm": 0.09804195165634155, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6768 + }, + { + "epoch": 5.406549520766773, + "grad_norm": 0.10416880995035172, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6769 + }, + { + "epoch": 5.407348242811501, + "grad_norm": 0.1509416699409485, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6770 + }, + { + "epoch": 5.40814696485623, + "grad_norm": 0.15458443760871887, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6771 + }, + { + "epoch": 5.4089456869009584, + "grad_norm": 0.08355830609798431, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6772 + }, + { + "epoch": 5.409744408945687, + "grad_norm": 0.1228979080915451, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6773 + }, + { + "epoch": 5.4105431309904155, + "grad_norm": 0.12139632552862167, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6774 + }, + { + "epoch": 5.411341853035144, + "grad_norm": 0.16298502683639526, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6775 + }, + { + "epoch": 5.412140575079873, + "grad_norm": 0.09110788255929947, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6776 + }, + { + "epoch": 5.4129392971246, + "grad_norm": 0.08584781736135483, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6777 + }, + { + "epoch": 5.413738019169329, + "grad_norm": 0.10148828476667404, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6778 + }, + { + "epoch": 5.414536741214057, + "grad_norm": 0.1046212688088417, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6779 + }, + { + "epoch": 5.415335463258786, + "grad_norm": 0.12530827522277832, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6780 + }, + { + "epoch": 5.416134185303514, + "grad_norm": 0.07337464392185211, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6781 + }, + { + "epoch": 5.416932907348243, + "grad_norm": 0.10839185118675232, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6782 + }, + { + "epoch": 5.417731629392971, + "grad_norm": 0.07784926891326904, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6783 + }, + { + "epoch": 5.4185303514377, + "grad_norm": 0.08692190796136856, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6784 + }, + { + "epoch": 5.419329073482428, + "grad_norm": 0.08721921592950821, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6785 + }, + { + "epoch": 5.420127795527157, + "grad_norm": 0.09581280499696732, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6786 + }, + { + "epoch": 5.420926517571885, + "grad_norm": 0.1156916618347168, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6787 + }, + { + "epoch": 5.421725239616613, + "grad_norm": 0.4520327150821686, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6788 + }, + { + "epoch": 5.422523961661342, + "grad_norm": 0.0948205217719078, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6789 + }, + { + "epoch": 5.42332268370607, + "grad_norm": 0.07208927720785141, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6790 + }, + { + "epoch": 5.424121405750799, + "grad_norm": 0.06830724328756332, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6791 + }, + { + "epoch": 5.424920127795527, + "grad_norm": 0.10488666594028473, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6792 + }, + { + "epoch": 5.425718849840256, + "grad_norm": 0.08509235084056854, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6793 + }, + { + "epoch": 5.426517571884984, + "grad_norm": 0.09133832901716232, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6794 + }, + { + "epoch": 5.427316293929713, + "grad_norm": 0.11715687066316605, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6795 + }, + { + "epoch": 5.428115015974441, + "grad_norm": 0.1196032389998436, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6796 + }, + { + "epoch": 5.428913738019169, + "grad_norm": 0.14141549170017242, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6797 + }, + { + "epoch": 5.4297124600638975, + "grad_norm": 0.12866206467151642, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6798 + }, + { + "epoch": 5.430511182108626, + "grad_norm": 0.10802716016769409, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6799 + }, + { + "epoch": 5.431309904153355, + "grad_norm": 0.10947239398956299, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6800 + }, + { + "epoch": 5.432108626198083, + "grad_norm": 0.08339721709489822, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6801 + }, + { + "epoch": 5.432907348242812, + "grad_norm": 0.12407296150922775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6802 + }, + { + "epoch": 5.43370607028754, + "grad_norm": 0.10537894070148468, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6803 + }, + { + "epoch": 5.434504792332269, + "grad_norm": 0.0920059084892273, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6804 + }, + { + "epoch": 5.435303514376997, + "grad_norm": 0.1502516269683838, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6805 + }, + { + "epoch": 5.436102236421725, + "grad_norm": 0.2798864245414734, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 6806 + }, + { + "epoch": 5.436900958466453, + "grad_norm": 0.11037585884332657, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6807 + }, + { + "epoch": 5.437699680511182, + "grad_norm": 0.12594881653785706, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6808 + }, + { + "epoch": 5.43849840255591, + "grad_norm": 0.09976109862327576, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6809 + }, + { + "epoch": 5.439297124600639, + "grad_norm": 0.3285512328147888, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6810 + }, + { + "epoch": 5.4400958466453675, + "grad_norm": 0.49450287222862244, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6811 + }, + { + "epoch": 5.440894568690096, + "grad_norm": 0.06817556917667389, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6812 + }, + { + "epoch": 5.4416932907348246, + "grad_norm": 0.14917057752609253, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6813 + }, + { + "epoch": 5.442492012779553, + "grad_norm": 0.10008134692907333, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6814 + }, + { + "epoch": 5.443290734824281, + "grad_norm": 0.07854767143726349, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6815 + }, + { + "epoch": 5.444089456869009, + "grad_norm": 0.2441248893737793, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6816 + }, + { + "epoch": 5.444888178913738, + "grad_norm": 0.1276157647371292, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6817 + }, + { + "epoch": 5.445686900958466, + "grad_norm": 0.11779431253671646, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6818 + }, + { + "epoch": 5.446485623003195, + "grad_norm": 0.11788108944892883, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6819 + }, + { + "epoch": 5.447284345047923, + "grad_norm": 0.06554995477199554, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6820 + }, + { + "epoch": 5.448083067092652, + "grad_norm": 0.07937108725309372, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6821 + }, + { + "epoch": 5.44888178913738, + "grad_norm": 0.08041426539421082, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6822 + }, + { + "epoch": 5.449680511182109, + "grad_norm": 0.12429161369800568, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6823 + }, + { + "epoch": 5.4504792332268375, + "grad_norm": 0.09993165731430054, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6824 + }, + { + "epoch": 5.451277955271565, + "grad_norm": 0.07077670097351074, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6825 + }, + { + "epoch": 5.452076677316294, + "grad_norm": 0.12163005024194717, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6826 + }, + { + "epoch": 5.452875399361022, + "grad_norm": 0.19080819189548492, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6827 + }, + { + "epoch": 5.453674121405751, + "grad_norm": 0.06450853496789932, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6828 + }, + { + "epoch": 5.454472843450479, + "grad_norm": 0.8893078565597534, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6829 + }, + { + "epoch": 5.455271565495208, + "grad_norm": 0.08225185424089432, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6830 + }, + { + "epoch": 5.456070287539936, + "grad_norm": 0.08631845563650131, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6831 + }, + { + "epoch": 5.456869009584665, + "grad_norm": 0.1858949214220047, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6832 + }, + { + "epoch": 5.457667731629393, + "grad_norm": 0.10997786372900009, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6833 + }, + { + "epoch": 5.458466453674122, + "grad_norm": 0.09691416472196579, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6834 + }, + { + "epoch": 5.4592651757188495, + "grad_norm": 0.12523561716079712, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6835 + }, + { + "epoch": 5.460063897763578, + "grad_norm": 0.10094364732503891, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6836 + }, + { + "epoch": 5.460862619808307, + "grad_norm": 0.06598310172557831, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6837 + }, + { + "epoch": 5.461661341853035, + "grad_norm": 0.10221479833126068, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6838 + }, + { + "epoch": 5.462460063897764, + "grad_norm": 0.6545975804328918, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6839 + }, + { + "epoch": 5.463258785942492, + "grad_norm": 0.12167128920555115, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6840 + }, + { + "epoch": 5.464057507987221, + "grad_norm": 0.10822924226522446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6841 + }, + { + "epoch": 5.464856230031949, + "grad_norm": 0.11905575543642044, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6842 + }, + { + "epoch": 5.465654952076678, + "grad_norm": 0.10276103764772415, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6843 + }, + { + "epoch": 5.466453674121405, + "grad_norm": 0.09087378531694412, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6844 + }, + { + "epoch": 5.467252396166134, + "grad_norm": 0.13117510080337524, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6845 + }, + { + "epoch": 5.468051118210862, + "grad_norm": 0.14824305474758148, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6846 + }, + { + "epoch": 5.468849840255591, + "grad_norm": 0.08553508669137955, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6847 + }, + { + "epoch": 5.4696485623003195, + "grad_norm": 0.12209141999483109, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6848 + }, + { + "epoch": 5.470447284345048, + "grad_norm": 0.1992058902978897, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6849 + }, + { + "epoch": 5.4712460063897765, + "grad_norm": 0.08518865704536438, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6850 + }, + { + "epoch": 5.472044728434505, + "grad_norm": 0.10496464371681213, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6851 + }, + { + "epoch": 5.472843450479234, + "grad_norm": 0.08789866417646408, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6852 + }, + { + "epoch": 5.473642172523961, + "grad_norm": 0.08592598885297775, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6853 + }, + { + "epoch": 5.47444089456869, + "grad_norm": 0.061165813356637955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6854 + }, + { + "epoch": 5.475239616613418, + "grad_norm": 0.06936467438936234, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6855 + }, + { + "epoch": 5.476038338658147, + "grad_norm": 0.20519734919071198, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6856 + }, + { + "epoch": 5.476837060702875, + "grad_norm": 0.087073415517807, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6857 + }, + { + "epoch": 5.477635782747604, + "grad_norm": 0.10153642296791077, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6858 + }, + { + "epoch": 5.478434504792332, + "grad_norm": 0.12416163831949234, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6859 + }, + { + "epoch": 5.479233226837061, + "grad_norm": 0.1047174334526062, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6860 + }, + { + "epoch": 5.4800319488817895, + "grad_norm": 0.13690868020057678, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6861 + }, + { + "epoch": 5.480830670926517, + "grad_norm": 0.15995970368385315, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6862 + }, + { + "epoch": 5.481629392971246, + "grad_norm": 0.08172900229692459, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6863 + }, + { + "epoch": 5.482428115015974, + "grad_norm": 0.10956761986017227, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6864 + }, + { + "epoch": 5.483226837060703, + "grad_norm": 0.12259931862354279, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6865 + }, + { + "epoch": 5.484025559105431, + "grad_norm": 0.08295698463916779, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6866 + }, + { + "epoch": 5.48482428115016, + "grad_norm": 0.10935505479574203, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6867 + }, + { + "epoch": 5.485623003194888, + "grad_norm": 0.12436006963253021, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6868 + }, + { + "epoch": 5.486421725239617, + "grad_norm": 0.08449307829141617, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6869 + }, + { + "epoch": 5.487220447284345, + "grad_norm": 0.10897113382816315, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6870 + }, + { + "epoch": 5.488019169329074, + "grad_norm": 0.06856910139322281, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6871 + }, + { + "epoch": 5.488817891373802, + "grad_norm": 0.07105988264083862, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6872 + }, + { + "epoch": 5.48961661341853, + "grad_norm": 0.08778723329305649, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6873 + }, + { + "epoch": 5.4904153354632586, + "grad_norm": 0.07818275690078735, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6874 + }, + { + "epoch": 5.491214057507987, + "grad_norm": 0.08410139381885529, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6875 + }, + { + "epoch": 5.492012779552716, + "grad_norm": 0.0804608166217804, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6876 + }, + { + "epoch": 5.492811501597444, + "grad_norm": 0.10089578479528427, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6877 + }, + { + "epoch": 5.493610223642173, + "grad_norm": 0.08231056481599808, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6878 + }, + { + "epoch": 5.494408945686901, + "grad_norm": 0.07642059773206711, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6879 + }, + { + "epoch": 5.49520766773163, + "grad_norm": 0.11312755942344666, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6880 + }, + { + "epoch": 5.496006389776358, + "grad_norm": 0.06288543343544006, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6881 + }, + { + "epoch": 5.496805111821086, + "grad_norm": 0.09648934751749039, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6882 + }, + { + "epoch": 5.497603833865814, + "grad_norm": 0.09374719858169556, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6883 + }, + { + "epoch": 5.498402555910543, + "grad_norm": 0.10596928000450134, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6884 + }, + { + "epoch": 5.4992012779552715, + "grad_norm": 0.06540077924728394, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6885 + }, + { + "epoch": 5.5, + "grad_norm": 0.05208199843764305, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6886 + }, + { + "epoch": 5.5007987220447285, + "grad_norm": 0.10762238502502441, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6887 + }, + { + "epoch": 5.501597444089457, + "grad_norm": 0.122553251683712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6888 + }, + { + "epoch": 5.502396166134186, + "grad_norm": 0.07663412392139435, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6889 + }, + { + "epoch": 5.503194888178914, + "grad_norm": 0.09100968390703201, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6890 + }, + { + "epoch": 5.503993610223642, + "grad_norm": 0.24931807816028595, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6891 + }, + { + "epoch": 5.50479233226837, + "grad_norm": 0.07812821120023727, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6892 + }, + { + "epoch": 5.505591054313099, + "grad_norm": 0.04760657623410225, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6893 + }, + { + "epoch": 5.506389776357827, + "grad_norm": 0.08183290809392929, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6894 + }, + { + "epoch": 5.507188498402556, + "grad_norm": 0.09541092067956924, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6895 + }, + { + "epoch": 5.507987220447284, + "grad_norm": 0.04168708249926567, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6896 + }, + { + "epoch": 5.508785942492013, + "grad_norm": 0.07038994133472443, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6897 + }, + { + "epoch": 5.5095846645367414, + "grad_norm": 0.060375142842531204, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6898 + }, + { + "epoch": 5.51038338658147, + "grad_norm": 0.048829223960638046, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6899 + }, + { + "epoch": 5.511182108626198, + "grad_norm": 0.057894766330718994, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6900 + }, + { + "epoch": 5.511980830670926, + "grad_norm": 0.05786101892590523, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6901 + }, + { + "epoch": 5.512779552715655, + "grad_norm": 0.07246953994035721, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6902 + }, + { + "epoch": 5.513578274760383, + "grad_norm": 0.07493462413549423, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6903 + }, + { + "epoch": 5.514376996805112, + "grad_norm": 0.060612600296735764, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6904 + }, + { + "epoch": 5.51517571884984, + "grad_norm": 0.0666302740573883, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6905 + }, + { + "epoch": 5.515974440894569, + "grad_norm": 0.08713024109601974, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6906 + }, + { + "epoch": 5.516773162939297, + "grad_norm": 0.31083860993385315, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6907 + }, + { + "epoch": 5.517571884984026, + "grad_norm": 0.0808933675289154, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6908 + }, + { + "epoch": 5.518370607028754, + "grad_norm": 0.1312016248703003, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6909 + }, + { + "epoch": 5.519169329073483, + "grad_norm": 0.20448890328407288, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6910 + }, + { + "epoch": 5.5199680511182105, + "grad_norm": 0.2519006133079529, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6911 + }, + { + "epoch": 5.520766773162939, + "grad_norm": 0.11359903216362, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6912 + }, + { + "epoch": 5.521565495207668, + "grad_norm": 0.07498760521411896, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6913 + }, + { + "epoch": 5.522364217252396, + "grad_norm": 0.06599561125040054, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6914 + }, + { + "epoch": 5.523162939297125, + "grad_norm": 0.08988697826862335, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6915 + }, + { + "epoch": 5.523961661341853, + "grad_norm": 0.06968241930007935, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6916 + }, + { + "epoch": 5.524760383386582, + "grad_norm": 0.07231415063142776, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6917 + }, + { + "epoch": 5.52555910543131, + "grad_norm": 0.07369428128004074, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6918 + }, + { + "epoch": 5.526357827476039, + "grad_norm": 0.07677069306373596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6919 + }, + { + "epoch": 5.527156549520766, + "grad_norm": 0.07391869276762009, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6920 + }, + { + "epoch": 5.527955271565495, + "grad_norm": 0.05270293354988098, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6921 + }, + { + "epoch": 5.5287539936102235, + "grad_norm": 0.10439106076955795, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6922 + }, + { + "epoch": 5.529552715654952, + "grad_norm": 0.06968904286623001, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6923 + }, + { + "epoch": 5.5303514376996805, + "grad_norm": 0.08401032537221909, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6924 + }, + { + "epoch": 5.531150159744409, + "grad_norm": 0.11993245035409927, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6925 + }, + { + "epoch": 5.531948881789138, + "grad_norm": 0.05857640504837036, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6926 + }, + { + "epoch": 5.532747603833866, + "grad_norm": 0.10513442009687424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6927 + }, + { + "epoch": 5.533546325878595, + "grad_norm": 0.12233056873083115, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6928 + }, + { + "epoch": 5.534345047923322, + "grad_norm": 0.06959997117519379, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6929 + }, + { + "epoch": 5.535143769968051, + "grad_norm": 0.08057182282209396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6930 + }, + { + "epoch": 5.535942492012779, + "grad_norm": 0.09816458821296692, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6931 + }, + { + "epoch": 5.536741214057508, + "grad_norm": 0.055738940834999084, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6932 + }, + { + "epoch": 5.537539936102236, + "grad_norm": 0.0939234122633934, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6933 + }, + { + "epoch": 5.538338658146965, + "grad_norm": 0.12143029272556305, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6934 + }, + { + "epoch": 5.539137380191693, + "grad_norm": 0.08409210294485092, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6935 + }, + { + "epoch": 5.539936102236422, + "grad_norm": 0.10690448433160782, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6936 + }, + { + "epoch": 5.5407348242811505, + "grad_norm": 0.20701836049556732, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6937 + }, + { + "epoch": 5.541533546325878, + "grad_norm": 0.09124163538217545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6938 + }, + { + "epoch": 5.542332268370607, + "grad_norm": 0.08295103162527084, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6939 + }, + { + "epoch": 5.543130990415335, + "grad_norm": 0.1179230809211731, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6940 + }, + { + "epoch": 5.543929712460064, + "grad_norm": 0.12345689535140991, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6941 + }, + { + "epoch": 5.544728434504792, + "grad_norm": 0.052616000175476074, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6942 + }, + { + "epoch": 5.545527156549521, + "grad_norm": 0.07918131351470947, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6943 + }, + { + "epoch": 5.546325878594249, + "grad_norm": 0.04847119748592377, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6944 + }, + { + "epoch": 5.547124600638978, + "grad_norm": 0.06204143166542053, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6945 + }, + { + "epoch": 5.547923322683706, + "grad_norm": 0.07778293639421463, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6946 + }, + { + "epoch": 5.548722044728435, + "grad_norm": 0.05037623643875122, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6947 + }, + { + "epoch": 5.549520766773163, + "grad_norm": 0.09024710208177567, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 6948 + }, + { + "epoch": 5.550319488817891, + "grad_norm": 0.0872211754322052, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6949 + }, + { + "epoch": 5.55111821086262, + "grad_norm": 0.08456625789403915, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6950 + }, + { + "epoch": 5.551916932907348, + "grad_norm": 0.054692018777132034, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6951 + }, + { + "epoch": 5.552715654952077, + "grad_norm": 0.10690787434577942, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6952 + }, + { + "epoch": 5.553514376996805, + "grad_norm": 0.07764400541782379, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6953 + }, + { + "epoch": 5.554313099041534, + "grad_norm": 0.08423051983118057, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6954 + }, + { + "epoch": 5.555111821086262, + "grad_norm": 0.06771727651357651, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6955 + }, + { + "epoch": 5.555910543130991, + "grad_norm": 0.10505887866020203, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6956 + }, + { + "epoch": 5.556709265175719, + "grad_norm": 0.054641906172037125, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6957 + }, + { + "epoch": 5.557507987220447, + "grad_norm": 0.05115118622779846, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6958 + }, + { + "epoch": 5.5583067092651754, + "grad_norm": 0.07177245616912842, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6959 + }, + { + "epoch": 5.559105431309904, + "grad_norm": 0.06642751395702362, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6960 + }, + { + "epoch": 5.5599041533546325, + "grad_norm": 0.08428867161273956, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6961 + }, + { + "epoch": 5.560702875399361, + "grad_norm": 0.044375378638505936, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6962 + }, + { + "epoch": 5.56150159744409, + "grad_norm": 0.06384986639022827, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6963 + }, + { + "epoch": 5.562300319488818, + "grad_norm": 0.052885912358760834, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6964 + }, + { + "epoch": 5.563099041533547, + "grad_norm": 0.05244029313325882, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6965 + }, + { + "epoch": 5.563897763578275, + "grad_norm": 0.1781054139137268, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6966 + }, + { + "epoch": 5.564696485623003, + "grad_norm": 0.8067191243171692, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6967 + }, + { + "epoch": 5.565495207667731, + "grad_norm": 0.0759076327085495, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6968 + }, + { + "epoch": 5.56629392971246, + "grad_norm": 0.0820186585187912, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6969 + }, + { + "epoch": 5.567092651757188, + "grad_norm": 2.901848316192627, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6970 + }, + { + "epoch": 5.567891373801917, + "grad_norm": 0.5663259625434875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6971 + }, + { + "epoch": 5.568690095846645, + "grad_norm": 0.34909728169441223, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6972 + }, + { + "epoch": 5.569488817891374, + "grad_norm": 0.3031843602657318, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6973 + }, + { + "epoch": 5.5702875399361025, + "grad_norm": 0.9258882403373718, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6974 + }, + { + "epoch": 5.571086261980831, + "grad_norm": 0.37162891030311584, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6975 + }, + { + "epoch": 5.571884984025559, + "grad_norm": 0.11269918829202652, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6976 + }, + { + "epoch": 5.572683706070287, + "grad_norm": 0.20953021943569183, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6977 + }, + { + "epoch": 5.573482428115016, + "grad_norm": 0.22324982285499573, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6978 + }, + { + "epoch": 5.574281150159744, + "grad_norm": 0.47017180919647217, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6979 + }, + { + "epoch": 5.575079872204473, + "grad_norm": 0.22266747057437897, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 6980 + }, + { + "epoch": 5.575878594249201, + "grad_norm": 0.1609373688697815, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6981 + }, + { + "epoch": 5.57667731629393, + "grad_norm": 0.17458784580230713, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6982 + }, + { + "epoch": 5.577476038338658, + "grad_norm": 0.17354144155979156, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6983 + }, + { + "epoch": 5.578274760383387, + "grad_norm": 0.10959888994693756, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6984 + }, + { + "epoch": 5.5790734824281145, + "grad_norm": 0.22630754113197327, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6985 + }, + { + "epoch": 5.579872204472844, + "grad_norm": 0.3786774277687073, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6986 + }, + { + "epoch": 5.580670926517572, + "grad_norm": 0.13818539679050446, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 6987 + }, + { + "epoch": 5.5814696485623, + "grad_norm": 0.22202269732952118, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6988 + }, + { + "epoch": 5.582268370607029, + "grad_norm": 0.08324426412582397, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6989 + }, + { + "epoch": 5.583067092651757, + "grad_norm": 0.16399513185024261, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6990 + }, + { + "epoch": 5.583865814696486, + "grad_norm": 0.13956478238105774, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6991 + }, + { + "epoch": 5.584664536741214, + "grad_norm": 0.09159751981496811, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6992 + }, + { + "epoch": 5.585463258785943, + "grad_norm": 0.19404387474060059, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6993 + }, + { + "epoch": 5.586261980830671, + "grad_norm": 0.07866083085536957, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6994 + }, + { + "epoch": 5.5870607028754, + "grad_norm": 0.10653684288263321, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6995 + }, + { + "epoch": 5.587859424920127, + "grad_norm": 0.12254250794649124, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6996 + }, + { + "epoch": 5.588658146964856, + "grad_norm": 0.0665711760520935, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6997 + }, + { + "epoch": 5.5894568690095845, + "grad_norm": 0.1234782338142395, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6998 + }, + { + "epoch": 5.590255591054313, + "grad_norm": 0.10345113277435303, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6999 + }, + { + "epoch": 5.5910543130990416, + "grad_norm": 0.10187766700983047, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7000 + }, + { + "epoch": 5.59185303514377, + "grad_norm": 0.10330864042043686, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7001 + }, + { + "epoch": 5.592651757188499, + "grad_norm": 0.12427254766225815, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7002 + }, + { + "epoch": 5.593450479233227, + "grad_norm": 0.06854265183210373, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7003 + }, + { + "epoch": 5.594249201277956, + "grad_norm": 0.07029487192630768, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7004 + }, + { + "epoch": 5.595047923322683, + "grad_norm": 0.07483061403036118, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7005 + }, + { + "epoch": 5.595846645367412, + "grad_norm": 0.08542168885469437, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7006 + }, + { + "epoch": 5.59664536741214, + "grad_norm": 0.05537399277091026, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7007 + }, + { + "epoch": 5.597444089456869, + "grad_norm": 0.28531956672668457, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7008 + }, + { + "epoch": 5.598242811501597, + "grad_norm": 0.1349600851535797, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7009 + }, + { + "epoch": 5.599041533546326, + "grad_norm": 0.06000711768865585, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7010 + }, + { + "epoch": 5.5998402555910545, + "grad_norm": 0.08139210939407349, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 7011 + }, + { + "epoch": 5.600638977635783, + "grad_norm": 0.08603602647781372, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7012 + }, + { + "epoch": 5.6014376996805115, + "grad_norm": 0.06586270034313202, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7013 + }, + { + "epoch": 5.602236421725239, + "grad_norm": 0.06276310235261917, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7014 + }, + { + "epoch": 5.603035143769968, + "grad_norm": 0.06072620674967766, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7015 + }, + { + "epoch": 5.603833865814696, + "grad_norm": 0.07509211450815201, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7016 + }, + { + "epoch": 5.604632587859425, + "grad_norm": 0.07241938263177872, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7017 + }, + { + "epoch": 5.605431309904153, + "grad_norm": 0.05110672488808632, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7018 + }, + { + "epoch": 5.606230031948882, + "grad_norm": 0.043005820363759995, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7019 + }, + { + "epoch": 5.60702875399361, + "grad_norm": 0.06298743188381195, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7020 + }, + { + "epoch": 5.607827476038339, + "grad_norm": 0.09457913786172867, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7021 + }, + { + "epoch": 5.608626198083067, + "grad_norm": 0.08066218346357346, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7022 + }, + { + "epoch": 5.609424920127795, + "grad_norm": 0.0845603421330452, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7023 + }, + { + "epoch": 5.6102236421725244, + "grad_norm": 0.09121926873922348, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7024 + }, + { + "epoch": 5.611022364217252, + "grad_norm": 0.12013491243124008, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7025 + }, + { + "epoch": 5.611821086261981, + "grad_norm": 0.062171660363674164, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7026 + }, + { + "epoch": 5.612619808306709, + "grad_norm": 0.05688954144716263, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7027 + }, + { + "epoch": 5.613418530351438, + "grad_norm": 0.049224793910980225, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7028 + }, + { + "epoch": 5.614217252396166, + "grad_norm": 0.06337599456310272, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7029 + }, + { + "epoch": 5.615015974440895, + "grad_norm": 0.03602084144949913, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7030 + }, + { + "epoch": 5.615814696485623, + "grad_norm": 0.06257645785808563, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7031 + }, + { + "epoch": 5.616613418530352, + "grad_norm": 0.09524381905794144, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7032 + }, + { + "epoch": 5.61741214057508, + "grad_norm": 0.06262468546628952, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7033 + }, + { + "epoch": 5.618210862619808, + "grad_norm": 0.23001722991466522, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7034 + }, + { + "epoch": 5.6190095846645365, + "grad_norm": 0.06312809139490128, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7035 + }, + { + "epoch": 5.619808306709265, + "grad_norm": 0.055973440408706665, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7036 + }, + { + "epoch": 5.6206070287539935, + "grad_norm": 0.0943455770611763, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7037 + }, + { + "epoch": 5.621405750798722, + "grad_norm": 0.05577901378273964, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7038 + }, + { + "epoch": 5.622204472843451, + "grad_norm": 0.057599395513534546, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7039 + }, + { + "epoch": 5.623003194888179, + "grad_norm": 0.07785748690366745, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7040 + }, + { + "epoch": 5.623801916932908, + "grad_norm": 0.04796557500958443, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7041 + }, + { + "epoch": 5.624600638977636, + "grad_norm": 0.19438667595386505, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7042 + }, + { + "epoch": 5.625399361022364, + "grad_norm": 0.10055433958768845, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7043 + }, + { + "epoch": 5.626198083067092, + "grad_norm": 0.06082126125693321, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7044 + }, + { + "epoch": 5.626996805111821, + "grad_norm": 0.07862866669893265, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7045 + }, + { + "epoch": 5.627795527156549, + "grad_norm": 0.09042234718799591, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7046 + }, + { + "epoch": 5.628594249201278, + "grad_norm": 0.06087128072977066, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7047 + }, + { + "epoch": 5.6293929712460065, + "grad_norm": 0.04091280326247215, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7048 + }, + { + "epoch": 5.630191693290735, + "grad_norm": 0.0625537633895874, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7049 + }, + { + "epoch": 5.6309904153354635, + "grad_norm": 0.04506808891892433, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7050 + }, + { + "epoch": 5.631789137380192, + "grad_norm": 0.0750357061624527, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7051 + }, + { + "epoch": 5.63258785942492, + "grad_norm": 0.06990372389554977, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7052 + }, + { + "epoch": 5.633386581469648, + "grad_norm": 0.05008876323699951, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7053 + }, + { + "epoch": 5.634185303514377, + "grad_norm": 0.07472547143697739, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7054 + }, + { + "epoch": 5.634984025559105, + "grad_norm": 0.04004117101430893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7055 + }, + { + "epoch": 5.635782747603834, + "grad_norm": 0.10103464871644974, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7056 + }, + { + "epoch": 5.636581469648562, + "grad_norm": 0.10850277543067932, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7057 + }, + { + "epoch": 5.637380191693291, + "grad_norm": 0.1109318807721138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7058 + }, + { + "epoch": 5.638178913738019, + "grad_norm": 0.06371457874774933, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7059 + }, + { + "epoch": 5.638977635782748, + "grad_norm": 0.1320749819278717, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7060 + }, + { + "epoch": 5.6397763578274756, + "grad_norm": 0.11957977712154388, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7061 + }, + { + "epoch": 5.640575079872205, + "grad_norm": 0.10327479988336563, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7062 + }, + { + "epoch": 5.641373801916933, + "grad_norm": 0.09731981158256531, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7063 + }, + { + "epoch": 5.642172523961661, + "grad_norm": 0.10276936739683151, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7064 + }, + { + "epoch": 5.64297124600639, + "grad_norm": 0.06973864883184433, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7065 + }, + { + "epoch": 5.643769968051118, + "grad_norm": 0.12020955234766006, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7066 + }, + { + "epoch": 5.644568690095847, + "grad_norm": 0.15950947999954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7067 + }, + { + "epoch": 5.645367412140575, + "grad_norm": 0.08034086227416992, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7068 + }, + { + "epoch": 5.646166134185304, + "grad_norm": 0.11269761621952057, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7069 + }, + { + "epoch": 5.646964856230032, + "grad_norm": 0.1569385826587677, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7070 + }, + { + "epoch": 5.647763578274761, + "grad_norm": 0.09290867298841476, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7071 + }, + { + "epoch": 5.6485623003194885, + "grad_norm": 0.0742817223072052, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7072 + }, + { + "epoch": 5.649361022364217, + "grad_norm": 0.3531377911567688, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7073 + }, + { + "epoch": 5.6501597444089455, + "grad_norm": 0.05365251749753952, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7074 + }, + { + "epoch": 5.650958466453674, + "grad_norm": 0.10185245424509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7075 + }, + { + "epoch": 5.651757188498403, + "grad_norm": 0.08978144079446793, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7076 + }, + { + "epoch": 5.652555910543131, + "grad_norm": 0.06563816964626312, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7077 + }, + { + "epoch": 5.65335463258786, + "grad_norm": 0.11167218536138535, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7078 + }, + { + "epoch": 5.654153354632588, + "grad_norm": 0.10078081488609314, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7079 + }, + { + "epoch": 5.654952076677317, + "grad_norm": 0.04581546410918236, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7080 + }, + { + "epoch": 5.655750798722044, + "grad_norm": 0.04128880053758621, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7081 + }, + { + "epoch": 5.656549520766773, + "grad_norm": 0.0887683555483818, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7082 + }, + { + "epoch": 5.657348242811501, + "grad_norm": 0.06673122197389603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7083 + }, + { + "epoch": 5.65814696485623, + "grad_norm": 0.12348195165395737, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7084 + }, + { + "epoch": 5.6589456869009584, + "grad_norm": 0.04828948527574539, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7085 + }, + { + "epoch": 5.659744408945687, + "grad_norm": 0.09094297885894775, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7086 + }, + { + "epoch": 5.6605431309904155, + "grad_norm": 0.05775933712720871, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7087 + }, + { + "epoch": 5.661341853035144, + "grad_norm": 0.06460239738225937, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7088 + }, + { + "epoch": 5.662140575079873, + "grad_norm": 0.07246532291173935, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7089 + }, + { + "epoch": 5.6629392971246, + "grad_norm": 0.05635413900017738, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7090 + }, + { + "epoch": 5.663738019169329, + "grad_norm": 0.05866781249642372, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7091 + }, + { + "epoch": 5.664536741214057, + "grad_norm": 0.11024738848209381, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7092 + }, + { + "epoch": 5.665335463258786, + "grad_norm": 2.880472421646118, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7093 + }, + { + "epoch": 5.666134185303514, + "grad_norm": 0.147624671459198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7094 + }, + { + "epoch": 5.666932907348243, + "grad_norm": 0.16042540967464447, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7095 + }, + { + "epoch": 5.667731629392971, + "grad_norm": 0.044081881642341614, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7096 + }, + { + "epoch": 5.6685303514377, + "grad_norm": 0.1580066829919815, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7097 + }, + { + "epoch": 5.669329073482428, + "grad_norm": 0.1348607987165451, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7098 + }, + { + "epoch": 5.670127795527156, + "grad_norm": 0.06525023281574249, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7099 + }, + { + "epoch": 5.6709265175718855, + "grad_norm": 0.12954704463481903, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7100 + }, + { + "epoch": 5.671725239616613, + "grad_norm": 0.09241525083780289, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7101 + }, + { + "epoch": 5.672523961661342, + "grad_norm": 0.05581163614988327, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7102 + }, + { + "epoch": 5.67332268370607, + "grad_norm": 0.0864885225892067, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7103 + }, + { + "epoch": 5.674121405750799, + "grad_norm": 0.0783633440732956, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7104 + }, + { + "epoch": 5.674920127795527, + "grad_norm": 2.419416666030884, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7105 + }, + { + "epoch": 5.675718849840256, + "grad_norm": 0.30067741870880127, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7106 + }, + { + "epoch": 5.676517571884984, + "grad_norm": 0.2876960337162018, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 7107 + }, + { + "epoch": 5.677316293929713, + "grad_norm": 0.13828304409980774, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7108 + }, + { + "epoch": 5.678115015974441, + "grad_norm": 0.12691721320152283, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7109 + }, + { + "epoch": 5.678913738019169, + "grad_norm": 0.18356311321258545, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 7110 + }, + { + "epoch": 5.6797124600638975, + "grad_norm": 0.13121426105499268, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7111 + }, + { + "epoch": 5.680511182108626, + "grad_norm": 0.13354304432868958, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7112 + }, + { + "epoch": 5.681309904153355, + "grad_norm": 0.10858450084924698, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7113 + }, + { + "epoch": 5.682108626198083, + "grad_norm": 0.12026678770780563, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 7114 + }, + { + "epoch": 5.682907348242812, + "grad_norm": 0.10297723114490509, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7115 + }, + { + "epoch": 5.68370607028754, + "grad_norm": 0.10481604188680649, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7116 + }, + { + "epoch": 5.684504792332269, + "grad_norm": 0.1389889419078827, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7117 + }, + { + "epoch": 5.685303514376997, + "grad_norm": 0.047913264483213425, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7118 + }, + { + "epoch": 5.686102236421725, + "grad_norm": 0.07504977285861969, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7119 + }, + { + "epoch": 5.686900958466453, + "grad_norm": 0.08858702331781387, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7120 + }, + { + "epoch": 5.687699680511182, + "grad_norm": 0.07746905088424683, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7121 + }, + { + "epoch": 5.68849840255591, + "grad_norm": 0.20370569825172424, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7122 + }, + { + "epoch": 5.689297124600639, + "grad_norm": 0.053284503519535065, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7123 + }, + { + "epoch": 5.6900958466453675, + "grad_norm": 0.08579347282648087, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7124 + }, + { + "epoch": 5.690894568690096, + "grad_norm": 0.11220933496952057, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7125 + }, + { + "epoch": 5.6916932907348246, + "grad_norm": 0.11851351708173752, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 7126 + }, + { + "epoch": 5.692492012779553, + "grad_norm": 0.0839112401008606, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7127 + }, + { + "epoch": 5.693290734824281, + "grad_norm": 0.07717803865671158, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7128 + }, + { + "epoch": 5.694089456869009, + "grad_norm": 0.10219333320856094, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7129 + }, + { + "epoch": 5.694888178913738, + "grad_norm": 0.06746016442775726, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7130 + }, + { + "epoch": 5.695686900958466, + "grad_norm": 0.09630785137414932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7131 + }, + { + "epoch": 5.696485623003195, + "grad_norm": 0.059845466166734695, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7132 + }, + { + "epoch": 5.697284345047923, + "grad_norm": 0.10587267577648163, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7133 + }, + { + "epoch": 5.698083067092652, + "grad_norm": 0.12221334874629974, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7134 + }, + { + "epoch": 5.69888178913738, + "grad_norm": 0.1638030856847763, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7135 + }, + { + "epoch": 5.699680511182109, + "grad_norm": 0.04686988145112991, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7136 + }, + { + "epoch": 5.700479233226837, + "grad_norm": 0.09120972454547882, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7137 + }, + { + "epoch": 5.701277955271565, + "grad_norm": 0.1081257089972496, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7138 + }, + { + "epoch": 5.702076677316294, + "grad_norm": 0.07313218712806702, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7139 + }, + { + "epoch": 5.702875399361022, + "grad_norm": 0.06039511039853096, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7140 + }, + { + "epoch": 5.703674121405751, + "grad_norm": 0.14473693072795868, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7141 + }, + { + "epoch": 5.704472843450479, + "grad_norm": 0.15062592923641205, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7142 + }, + { + "epoch": 5.705271565495208, + "grad_norm": 0.09711029380559921, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7143 + }, + { + "epoch": 5.706070287539936, + "grad_norm": 0.056874651461839676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7144 + }, + { + "epoch": 5.706869009584665, + "grad_norm": 0.1077205091714859, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7145 + }, + { + "epoch": 5.707667731629393, + "grad_norm": 0.1437366008758545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7146 + }, + { + "epoch": 5.708466453674122, + "grad_norm": 0.06206873059272766, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7147 + }, + { + "epoch": 5.7092651757188495, + "grad_norm": 0.06379563361406326, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7148 + }, + { + "epoch": 5.710063897763578, + "grad_norm": 0.11586727946996689, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7149 + }, + { + "epoch": 5.710862619808307, + "grad_norm": 0.12792269885540009, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7150 + }, + { + "epoch": 5.711661341853035, + "grad_norm": 0.08514344692230225, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7151 + }, + { + "epoch": 5.712460063897764, + "grad_norm": 0.045359376817941666, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7152 + }, + { + "epoch": 5.713258785942492, + "grad_norm": 0.13782942295074463, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7153 + }, + { + "epoch": 5.714057507987221, + "grad_norm": 0.1362733691930771, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7154 + }, + { + "epoch": 5.714856230031949, + "grad_norm": 0.11249929666519165, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7155 + }, + { + "epoch": 5.715654952076678, + "grad_norm": 0.07308060675859451, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7156 + }, + { + "epoch": 5.716453674121405, + "grad_norm": 0.08434231579303741, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7157 + }, + { + "epoch": 5.717252396166134, + "grad_norm": 0.0800870731472969, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7158 + }, + { + "epoch": 5.718051118210862, + "grad_norm": 0.09833595156669617, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7159 + }, + { + "epoch": 5.718849840255591, + "grad_norm": 0.06979871541261673, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7160 + }, + { + "epoch": 5.7196485623003195, + "grad_norm": 0.3326590657234192, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7161 + }, + { + "epoch": 5.720447284345048, + "grad_norm": 0.07953538745641708, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7162 + }, + { + "epoch": 5.7212460063897765, + "grad_norm": 0.06084589287638664, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7163 + }, + { + "epoch": 5.722044728434505, + "grad_norm": 0.05060078203678131, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7164 + }, + { + "epoch": 5.722843450479234, + "grad_norm": 0.11765584349632263, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7165 + }, + { + "epoch": 5.723642172523961, + "grad_norm": 0.11147762089967728, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7166 + }, + { + "epoch": 5.72444089456869, + "grad_norm": 0.051353756338357925, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7167 + }, + { + "epoch": 5.725239616613418, + "grad_norm": 0.06255709379911423, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7168 + }, + { + "epoch": 5.726038338658147, + "grad_norm": 0.048915427178144455, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7169 + }, + { + "epoch": 5.726837060702875, + "grad_norm": 0.057233601808547974, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7170 + }, + { + "epoch": 5.727635782747604, + "grad_norm": 0.0828251764178276, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7171 + }, + { + "epoch": 5.728434504792332, + "grad_norm": 0.07387874275445938, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7172 + }, + { + "epoch": 5.729233226837061, + "grad_norm": 0.04857983812689781, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7173 + }, + { + "epoch": 5.7300319488817895, + "grad_norm": 0.07202452421188354, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7174 + }, + { + "epoch": 5.730830670926517, + "grad_norm": 0.4291386306285858, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7175 + }, + { + "epoch": 5.731629392971246, + "grad_norm": 0.07219598442316055, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7176 + }, + { + "epoch": 5.732428115015974, + "grad_norm": 0.07889580726623535, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7177 + }, + { + "epoch": 5.733226837060703, + "grad_norm": 0.1154242753982544, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7178 + }, + { + "epoch": 5.734025559105431, + "grad_norm": 0.1711360067129135, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7179 + }, + { + "epoch": 5.73482428115016, + "grad_norm": 0.15897679328918457, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7180 + }, + { + "epoch": 5.735623003194888, + "grad_norm": 0.056718453764915466, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7181 + }, + { + "epoch": 5.736421725239617, + "grad_norm": 0.10130516439676285, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7182 + }, + { + "epoch": 5.737220447284345, + "grad_norm": 0.10965991020202637, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7183 + }, + { + "epoch": 5.738019169329074, + "grad_norm": 0.043925706297159195, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7184 + }, + { + "epoch": 5.738817891373802, + "grad_norm": 0.16040641069412231, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7185 + }, + { + "epoch": 5.73961661341853, + "grad_norm": 0.545796275138855, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7186 + }, + { + "epoch": 5.7404153354632586, + "grad_norm": 0.12285015732049942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7187 + }, + { + "epoch": 5.741214057507987, + "grad_norm": 0.1241980791091919, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7188 + }, + { + "epoch": 5.742012779552716, + "grad_norm": 0.18415005505084991, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7189 + }, + { + "epoch": 5.742811501597444, + "grad_norm": 0.1455639749765396, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7190 + }, + { + "epoch": 5.743610223642173, + "grad_norm": 0.05731341987848282, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7191 + }, + { + "epoch": 5.744408945686901, + "grad_norm": 0.10810694098472595, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7192 + }, + { + "epoch": 5.74520766773163, + "grad_norm": 0.13279423117637634, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7193 + }, + { + "epoch": 5.746006389776358, + "grad_norm": 0.048075832426548004, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7194 + }, + { + "epoch": 5.746805111821086, + "grad_norm": 0.07276510447263718, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7195 + }, + { + "epoch": 5.747603833865814, + "grad_norm": 0.0666821077466011, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7196 + }, + { + "epoch": 5.748402555910543, + "grad_norm": 0.0950300320982933, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7197 + }, + { + "epoch": 5.7492012779552715, + "grad_norm": 0.07229208946228027, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7198 + }, + { + "epoch": 5.75, + "grad_norm": 0.08129260689020157, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7199 + }, + { + "epoch": 5.7507987220447285, + "grad_norm": 0.08685708791017532, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7200 + }, + { + "epoch": 5.751597444089457, + "grad_norm": 0.048116523772478104, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7201 + }, + { + "epoch": 5.752396166134186, + "grad_norm": 0.08470416814088821, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7202 + }, + { + "epoch": 5.753194888178914, + "grad_norm": 0.09388689696788788, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7203 + }, + { + "epoch": 5.753993610223642, + "grad_norm": 0.07961093634366989, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7204 + }, + { + "epoch": 5.75479233226837, + "grad_norm": 0.05949364975094795, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7205 + }, + { + "epoch": 5.755591054313099, + "grad_norm": 0.10149726271629333, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7206 + }, + { + "epoch": 5.756389776357827, + "grad_norm": 0.30414992570877075, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7207 + }, + { + "epoch": 5.757188498402556, + "grad_norm": 0.06670042872428894, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7208 + }, + { + "epoch": 5.757987220447284, + "grad_norm": 0.061501920223236084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7209 + }, + { + "epoch": 5.758785942492013, + "grad_norm": 0.06627584993839264, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7210 + }, + { + "epoch": 5.7595846645367414, + "grad_norm": 0.1268157660961151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7211 + }, + { + "epoch": 5.76038338658147, + "grad_norm": 0.10253716260194778, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7212 + }, + { + "epoch": 5.761182108626198, + "grad_norm": 0.08384321630001068, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7213 + }, + { + "epoch": 5.761980830670926, + "grad_norm": 0.09078267216682434, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7214 + }, + { + "epoch": 5.762779552715655, + "grad_norm": 0.10487394034862518, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7215 + }, + { + "epoch": 5.763578274760383, + "grad_norm": 0.12192805856466293, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7216 + }, + { + "epoch": 5.764376996805112, + "grad_norm": 0.16597039997577667, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7217 + }, + { + "epoch": 5.76517571884984, + "grad_norm": 0.08498643338680267, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7218 + }, + { + "epoch": 5.765974440894569, + "grad_norm": 0.12794862687587738, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7219 + }, + { + "epoch": 5.766773162939297, + "grad_norm": 0.13595858216285706, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7220 + }, + { + "epoch": 5.767571884984026, + "grad_norm": 0.08182058483362198, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7221 + }, + { + "epoch": 5.768370607028754, + "grad_norm": 0.11747279763221741, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7222 + }, + { + "epoch": 5.769169329073483, + "grad_norm": 0.13400238752365112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7223 + }, + { + "epoch": 5.7699680511182105, + "grad_norm": 0.18527893722057343, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7224 + }, + { + "epoch": 5.770766773162939, + "grad_norm": 0.05130131170153618, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7225 + }, + { + "epoch": 5.771565495207668, + "grad_norm": 0.14139772951602936, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7226 + }, + { + "epoch": 5.772364217252396, + "grad_norm": 0.07901434600353241, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7227 + }, + { + "epoch": 5.773162939297125, + "grad_norm": 0.0642717182636261, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7228 + }, + { + "epoch": 5.773961661341853, + "grad_norm": 0.0693419873714447, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7229 + }, + { + "epoch": 5.774760383386582, + "grad_norm": 0.06490292400121689, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7230 + }, + { + "epoch": 5.77555910543131, + "grad_norm": 0.09405414760112762, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7231 + }, + { + "epoch": 5.776357827476039, + "grad_norm": 0.10439605265855789, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7232 + }, + { + "epoch": 5.777156549520766, + "grad_norm": 0.06811316311359406, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7233 + }, + { + "epoch": 5.777955271565495, + "grad_norm": 0.0707770362496376, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7234 + }, + { + "epoch": 5.7787539936102235, + "grad_norm": 0.08751409500837326, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7235 + }, + { + "epoch": 5.779552715654952, + "grad_norm": 0.09626015275716782, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7236 + }, + { + "epoch": 5.7803514376996805, + "grad_norm": 0.11487453430891037, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7237 + }, + { + "epoch": 5.781150159744409, + "grad_norm": 0.06278856843709946, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7238 + }, + { + "epoch": 5.781948881789138, + "grad_norm": 0.131802499294281, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7239 + }, + { + "epoch": 5.782747603833866, + "grad_norm": 0.09209976345300674, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7240 + }, + { + "epoch": 5.783546325878595, + "grad_norm": 0.06524617224931717, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7241 + }, + { + "epoch": 5.784345047923322, + "grad_norm": 0.10735169053077698, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7242 + }, + { + "epoch": 5.785143769968051, + "grad_norm": 0.08926022797822952, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7243 + }, + { + "epoch": 5.785942492012779, + "grad_norm": 0.08254969120025635, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7244 + }, + { + "epoch": 5.786741214057508, + "grad_norm": 0.07478158175945282, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7245 + }, + { + "epoch": 5.787539936102236, + "grad_norm": 0.0974164679646492, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7246 + }, + { + "epoch": 5.788338658146965, + "grad_norm": 0.05145352706313133, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7247 + }, + { + "epoch": 5.789137380191693, + "grad_norm": 0.11986715346574783, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7248 + }, + { + "epoch": 5.789936102236422, + "grad_norm": 0.12020506709814072, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7249 + }, + { + "epoch": 5.7907348242811505, + "grad_norm": 0.07199704647064209, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7250 + }, + { + "epoch": 5.791533546325878, + "grad_norm": 0.10702182352542877, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7251 + }, + { + "epoch": 5.792332268370607, + "grad_norm": 0.10817115753889084, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7252 + }, + { + "epoch": 5.793130990415335, + "grad_norm": 0.1875494122505188, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7253 + }, + { + "epoch": 5.793929712460064, + "grad_norm": 0.07347052544355392, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7254 + }, + { + "epoch": 5.794728434504792, + "grad_norm": 0.08588847517967224, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7255 + }, + { + "epoch": 5.795527156549521, + "grad_norm": 0.08241020143032074, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7256 + }, + { + "epoch": 5.796325878594249, + "grad_norm": 0.06322775781154633, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7257 + }, + { + "epoch": 5.797124600638978, + "grad_norm": 0.10279159247875214, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7258 + }, + { + "epoch": 5.797923322683706, + "grad_norm": 0.1887427717447281, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7259 + }, + { + "epoch": 5.798722044728435, + "grad_norm": 0.12288179248571396, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7260 + }, + { + "epoch": 5.799520766773163, + "grad_norm": 0.07014663517475128, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7261 + }, + { + "epoch": 5.800319488817891, + "grad_norm": 0.3741980493068695, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7262 + }, + { + "epoch": 5.80111821086262, + "grad_norm": 0.10083315521478653, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7263 + }, + { + "epoch": 5.801916932907348, + "grad_norm": 0.06427261233329773, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7264 + }, + { + "epoch": 5.802715654952077, + "grad_norm": 0.06265366077423096, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7265 + }, + { + "epoch": 5.803514376996805, + "grad_norm": 0.09602728486061096, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7266 + }, + { + "epoch": 5.804313099041534, + "grad_norm": 0.10369620472192764, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7267 + }, + { + "epoch": 5.805111821086262, + "grad_norm": 0.09742012619972229, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7268 + }, + { + "epoch": 5.805910543130991, + "grad_norm": 0.11579136550426483, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7269 + }, + { + "epoch": 5.806709265175719, + "grad_norm": 0.11265771090984344, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7270 + }, + { + "epoch": 5.807507987220447, + "grad_norm": 0.10684274882078171, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7271 + }, + { + "epoch": 5.8083067092651754, + "grad_norm": 0.12550850212574005, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7272 + }, + { + "epoch": 5.809105431309904, + "grad_norm": 0.04966668784618378, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7273 + }, + { + "epoch": 5.8099041533546325, + "grad_norm": 0.26124852895736694, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7274 + }, + { + "epoch": 5.810702875399361, + "grad_norm": 0.12293774634599686, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7275 + }, + { + "epoch": 5.81150159744409, + "grad_norm": 0.11183387041091919, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7276 + }, + { + "epoch": 5.812300319488818, + "grad_norm": 0.08738099783658981, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7277 + }, + { + "epoch": 5.813099041533547, + "grad_norm": 0.06429604440927505, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7278 + }, + { + "epoch": 5.813897763578275, + "grad_norm": 0.09102299064397812, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7279 + }, + { + "epoch": 5.814696485623003, + "grad_norm": 0.06249788776040077, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7280 + }, + { + "epoch": 5.815495207667731, + "grad_norm": 0.08752568066120148, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7281 + }, + { + "epoch": 5.81629392971246, + "grad_norm": 0.06289692968130112, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7282 + }, + { + "epoch": 5.817092651757188, + "grad_norm": 0.1269187480211258, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7283 + }, + { + "epoch": 5.817891373801917, + "grad_norm": 0.0839361846446991, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7284 + }, + { + "epoch": 5.818690095846645, + "grad_norm": 0.0855027437210083, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7285 + }, + { + "epoch": 5.819488817891374, + "grad_norm": 0.20559446513652802, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7286 + }, + { + "epoch": 5.8202875399361025, + "grad_norm": 0.0740990862250328, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7287 + }, + { + "epoch": 5.821086261980831, + "grad_norm": 0.06762924790382385, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7288 + }, + { + "epoch": 5.821884984025559, + "grad_norm": 0.5238296985626221, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7289 + }, + { + "epoch": 5.822683706070287, + "grad_norm": 0.09929470717906952, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7290 + }, + { + "epoch": 5.823482428115016, + "grad_norm": 0.11528550088405609, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7291 + }, + { + "epoch": 5.824281150159744, + "grad_norm": 0.10563576966524124, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 7292 + }, + { + "epoch": 5.825079872204473, + "grad_norm": 0.13924843072891235, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7293 + }, + { + "epoch": 5.825878594249201, + "grad_norm": 0.1332271546125412, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7294 + }, + { + "epoch": 5.82667731629393, + "grad_norm": 0.15709803998470306, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7295 + }, + { + "epoch": 5.827476038338658, + "grad_norm": 0.19638708233833313, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7296 + }, + { + "epoch": 5.828274760383387, + "grad_norm": 0.16845624148845673, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7297 + }, + { + "epoch": 5.8290734824281145, + "grad_norm": 0.15753695368766785, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7298 + }, + { + "epoch": 5.829872204472844, + "grad_norm": 0.04734346270561218, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7299 + }, + { + "epoch": 5.830670926517572, + "grad_norm": 0.48153460025787354, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7300 + }, + { + "epoch": 5.8314696485623, + "grad_norm": 0.09118880331516266, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7301 + }, + { + "epoch": 5.832268370607029, + "grad_norm": 0.10301438719034195, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7302 + }, + { + "epoch": 5.833067092651757, + "grad_norm": 0.12838974595069885, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7303 + }, + { + "epoch": 5.833865814696486, + "grad_norm": 0.1537700593471527, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7304 + }, + { + "epoch": 5.834664536741214, + "grad_norm": 0.08763979375362396, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7305 + }, + { + "epoch": 5.835463258785943, + "grad_norm": 0.2613058388233185, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7306 + }, + { + "epoch": 5.836261980830671, + "grad_norm": 0.13767825067043304, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7307 + }, + { + "epoch": 5.8370607028754, + "grad_norm": 0.14907905459403992, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7308 + }, + { + "epoch": 5.837859424920127, + "grad_norm": 0.3314233124256134, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 7309 + }, + { + "epoch": 5.838658146964856, + "grad_norm": 0.1368636041879654, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7310 + }, + { + "epoch": 5.8394568690095845, + "grad_norm": 0.13423767685890198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7311 + }, + { + "epoch": 5.840255591054313, + "grad_norm": 0.08914478868246078, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7312 + }, + { + "epoch": 5.8410543130990416, + "grad_norm": 0.09363356977701187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7313 + }, + { + "epoch": 5.84185303514377, + "grad_norm": 0.226780965924263, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7314 + }, + { + "epoch": 5.842651757188499, + "grad_norm": 0.09002092480659485, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7315 + }, + { + "epoch": 5.843450479233227, + "grad_norm": 0.06387127935886383, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7316 + }, + { + "epoch": 5.844249201277956, + "grad_norm": 0.1643945276737213, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7317 + }, + { + "epoch": 5.845047923322683, + "grad_norm": 0.13561291992664337, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7318 + }, + { + "epoch": 5.845846645367412, + "grad_norm": 0.14334949851036072, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7319 + }, + { + "epoch": 5.84664536741214, + "grad_norm": 0.13982698321342468, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7320 + }, + { + "epoch": 5.847444089456869, + "grad_norm": 0.10822772979736328, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7321 + }, + { + "epoch": 5.848242811501597, + "grad_norm": 0.07073087245225906, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7322 + }, + { + "epoch": 5.849041533546326, + "grad_norm": 0.09560684859752655, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7323 + }, + { + "epoch": 5.8498402555910545, + "grad_norm": 0.0882779061794281, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7324 + }, + { + "epoch": 5.850638977635783, + "grad_norm": 0.17319771647453308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7325 + }, + { + "epoch": 5.8514376996805115, + "grad_norm": 0.12140306830406189, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7326 + }, + { + "epoch": 5.852236421725239, + "grad_norm": 0.12064560502767563, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7327 + }, + { + "epoch": 5.853035143769968, + "grad_norm": 0.0733642578125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7328 + }, + { + "epoch": 5.853833865814696, + "grad_norm": 0.08563291281461716, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7329 + }, + { + "epoch": 5.854632587859425, + "grad_norm": 0.11337493360042572, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7330 + }, + { + "epoch": 5.855431309904153, + "grad_norm": 0.12164553254842758, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7331 + }, + { + "epoch": 5.856230031948882, + "grad_norm": 0.06406484544277191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7332 + }, + { + "epoch": 5.85702875399361, + "grad_norm": 0.0765780508518219, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7333 + }, + { + "epoch": 5.857827476038339, + "grad_norm": 0.12847815454006195, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7334 + }, + { + "epoch": 5.858626198083067, + "grad_norm": 0.11934550106525421, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7335 + }, + { + "epoch": 5.859424920127795, + "grad_norm": 0.08170188963413239, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7336 + }, + { + "epoch": 5.8602236421725244, + "grad_norm": 0.13636507093906403, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7337 + }, + { + "epoch": 5.861022364217252, + "grad_norm": 0.11030741780996323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7338 + }, + { + "epoch": 5.861821086261981, + "grad_norm": 0.10200777649879456, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7339 + }, + { + "epoch": 5.862619808306709, + "grad_norm": 0.09916897118091583, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7340 + }, + { + "epoch": 5.863418530351438, + "grad_norm": 0.08136509358882904, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7341 + }, + { + "epoch": 5.864217252396166, + "grad_norm": 0.051609545946121216, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7342 + }, + { + "epoch": 5.865015974440895, + "grad_norm": 0.061890844255685806, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7343 + }, + { + "epoch": 5.865814696485623, + "grad_norm": 0.10308966040611267, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7344 + }, + { + "epoch": 5.866613418530352, + "grad_norm": 0.06762709468603134, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7345 + }, + { + "epoch": 5.86741214057508, + "grad_norm": 0.07767036557197571, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7346 + }, + { + "epoch": 5.868210862619808, + "grad_norm": 0.10608458518981934, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7347 + }, + { + "epoch": 5.8690095846645365, + "grad_norm": 0.13812315464019775, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7348 + }, + { + "epoch": 5.869808306709265, + "grad_norm": 0.10485442727804184, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7349 + }, + { + "epoch": 5.8706070287539935, + "grad_norm": 0.08510198444128036, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7350 + }, + { + "epoch": 5.871405750798722, + "grad_norm": 0.17235122621059418, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7351 + }, + { + "epoch": 5.872204472843451, + "grad_norm": 0.057075515389442444, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7352 + }, + { + "epoch": 5.873003194888179, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7353 + }, + { + "epoch": 5.873801916932908, + "grad_norm": 0.1859748661518097, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7354 + }, + { + "epoch": 5.874600638977636, + "grad_norm": 0.2350156307220459, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7355 + }, + { + "epoch": 5.875399361022364, + "grad_norm": 0.11264859884977341, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 7356 + }, + { + "epoch": 5.876198083067092, + "grad_norm": 0.2859210968017578, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7357 + }, + { + "epoch": 5.876996805111821, + "grad_norm": 0.08706829696893692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7358 + }, + { + "epoch": 5.877795527156549, + "grad_norm": 0.0644318088889122, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7359 + }, + { + "epoch": 5.878594249201278, + "grad_norm": 0.10985474288463593, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7360 + }, + { + "epoch": 5.8793929712460065, + "grad_norm": 0.09968867897987366, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7361 + }, + { + "epoch": 5.880191693290735, + "grad_norm": 0.07277355343103409, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7362 + }, + { + "epoch": 5.8809904153354635, + "grad_norm": 0.043085962533950806, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7363 + }, + { + "epoch": 5.881789137380192, + "grad_norm": 0.10392415523529053, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7364 + }, + { + "epoch": 5.88258785942492, + "grad_norm": 0.05523041635751724, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7365 + }, + { + "epoch": 5.883386581469648, + "grad_norm": 0.1754276603460312, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7366 + }, + { + "epoch": 5.884185303514377, + "grad_norm": 0.09561391174793243, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7367 + }, + { + "epoch": 5.884984025559105, + "grad_norm": 0.17572976648807526, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7368 + }, + { + "epoch": 5.885782747603834, + "grad_norm": 0.06476190686225891, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7369 + }, + { + "epoch": 5.886581469648562, + "grad_norm": 0.08763223886489868, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7370 + }, + { + "epoch": 5.887380191693291, + "grad_norm": 0.04419226944446564, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7371 + }, + { + "epoch": 5.888178913738019, + "grad_norm": 0.08707522600889206, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7372 + }, + { + "epoch": 5.888977635782748, + "grad_norm": 0.3117498457431793, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7373 + }, + { + "epoch": 5.8897763578274756, + "grad_norm": 0.04153338074684143, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7374 + }, + { + "epoch": 5.890575079872205, + "grad_norm": 0.10575849562883377, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7375 + }, + { + "epoch": 5.891373801916933, + "grad_norm": 0.07147886604070663, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7376 + }, + { + "epoch": 5.892172523961661, + "grad_norm": 0.05394810438156128, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7377 + }, + { + "epoch": 5.89297124600639, + "grad_norm": 0.15453197062015533, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7378 + }, + { + "epoch": 5.893769968051118, + "grad_norm": 0.19460639357566833, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7379 + }, + { + "epoch": 5.894568690095847, + "grad_norm": 0.13046157360076904, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7380 + }, + { + "epoch": 5.895367412140575, + "grad_norm": 0.09074800461530685, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7381 + }, + { + "epoch": 5.896166134185304, + "grad_norm": 0.09315948188304901, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7382 + }, + { + "epoch": 5.896964856230032, + "grad_norm": 0.0572352297604084, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7383 + }, + { + "epoch": 5.897763578274761, + "grad_norm": 0.09366700798273087, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7384 + }, + { + "epoch": 5.8985623003194885, + "grad_norm": 0.12643125653266907, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7385 + }, + { + "epoch": 5.899361022364217, + "grad_norm": 0.14831441640853882, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7386 + }, + { + "epoch": 5.9001597444089455, + "grad_norm": 0.06892798840999603, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7387 + }, + { + "epoch": 5.900958466453674, + "grad_norm": 0.24058189988136292, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7388 + }, + { + "epoch": 5.901757188498403, + "grad_norm": 0.12589944899082184, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7389 + }, + { + "epoch": 5.902555910543131, + "grad_norm": 0.10197508335113525, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7390 + }, + { + "epoch": 5.90335463258786, + "grad_norm": 0.04367182031273842, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7391 + }, + { + "epoch": 5.904153354632588, + "grad_norm": 0.11131702363491058, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7392 + }, + { + "epoch": 5.904952076677317, + "grad_norm": 0.10258752107620239, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7393 + }, + { + "epoch": 5.905750798722044, + "grad_norm": 0.05077935755252838, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7394 + }, + { + "epoch": 5.906549520766773, + "grad_norm": 0.13514964282512665, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7395 + }, + { + "epoch": 5.907348242811501, + "grad_norm": 0.365681916475296, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7396 + }, + { + "epoch": 5.90814696485623, + "grad_norm": 0.09199032932519913, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7397 + }, + { + "epoch": 5.9089456869009584, + "grad_norm": 0.10341943800449371, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7398 + }, + { + "epoch": 5.909744408945687, + "grad_norm": 0.05396822467446327, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7399 + }, + { + "epoch": 5.9105431309904155, + "grad_norm": 0.06582850217819214, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 7400 + }, + { + "epoch": 5.911341853035144, + "grad_norm": 0.04932714253664017, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7401 + }, + { + "epoch": 5.912140575079873, + "grad_norm": 0.08820181339979172, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7402 + }, + { + "epoch": 5.9129392971246, + "grad_norm": 0.08759067952632904, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7403 + }, + { + "epoch": 5.913738019169329, + "grad_norm": 0.0582246370613575, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7404 + }, + { + "epoch": 5.914536741214057, + "grad_norm": 0.3632248044013977, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7405 + }, + { + "epoch": 5.915335463258786, + "grad_norm": 0.054485730826854706, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7406 + }, + { + "epoch": 5.916134185303514, + "grad_norm": 0.06776587665081024, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7407 + }, + { + "epoch": 5.916932907348243, + "grad_norm": 0.06876091659069061, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7408 + }, + { + "epoch": 5.917731629392971, + "grad_norm": 0.06507224589586258, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7409 + }, + { + "epoch": 5.9185303514377, + "grad_norm": 1.061123013496399, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7410 + }, + { + "epoch": 5.919329073482428, + "grad_norm": 0.2808170020580292, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7411 + }, + { + "epoch": 5.920127795527156, + "grad_norm": 0.2075907289981842, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7412 + }, + { + "epoch": 5.9209265175718855, + "grad_norm": 0.08707362413406372, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7413 + }, + { + "epoch": 5.921725239616613, + "grad_norm": 0.17357248067855835, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 7414 + }, + { + "epoch": 5.922523961661342, + "grad_norm": 0.19713328778743744, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7415 + }, + { + "epoch": 5.92332268370607, + "grad_norm": 0.10456258803606033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7416 + }, + { + "epoch": 5.924121405750799, + "grad_norm": 0.10678638517856598, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7417 + }, + { + "epoch": 5.924920127795527, + "grad_norm": 0.12577000260353088, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7418 + }, + { + "epoch": 5.925718849840256, + "grad_norm": 0.14730660617351532, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7419 + }, + { + "epoch": 5.926517571884984, + "grad_norm": 0.07055118680000305, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7420 + }, + { + "epoch": 5.927316293929713, + "grad_norm": 0.10249259322881699, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7421 + }, + { + "epoch": 5.928115015974441, + "grad_norm": 0.06859050691127777, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7422 + }, + { + "epoch": 5.928913738019169, + "grad_norm": 0.043517664074897766, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7423 + }, + { + "epoch": 5.9297124600638975, + "grad_norm": 0.06680947542190552, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7424 + }, + { + "epoch": 5.930511182108626, + "grad_norm": 0.07522429525852203, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7425 + }, + { + "epoch": 5.931309904153355, + "grad_norm": 0.15828543901443481, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7426 + }, + { + "epoch": 5.932108626198083, + "grad_norm": 0.19134600460529327, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7427 + }, + { + "epoch": 5.932907348242812, + "grad_norm": 0.12455222010612488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7428 + }, + { + "epoch": 5.93370607028754, + "grad_norm": 0.11147905886173248, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7429 + }, + { + "epoch": 5.934504792332269, + "grad_norm": 0.1238674744963646, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7430 + }, + { + "epoch": 5.935303514376997, + "grad_norm": 0.15700307488441467, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7431 + }, + { + "epoch": 5.936102236421725, + "grad_norm": 0.11487080156803131, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7432 + }, + { + "epoch": 5.936900958466453, + "grad_norm": 0.11961077898740768, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7433 + }, + { + "epoch": 5.937699680511182, + "grad_norm": 0.07594173401594162, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7434 + }, + { + "epoch": 5.93849840255591, + "grad_norm": 0.19439400732517242, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7435 + }, + { + "epoch": 5.939297124600639, + "grad_norm": 0.17745599150657654, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7436 + }, + { + "epoch": 5.9400958466453675, + "grad_norm": 0.15732692182064056, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7437 + }, + { + "epoch": 5.940894568690096, + "grad_norm": 0.08824916929006577, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7438 + }, + { + "epoch": 5.9416932907348246, + "grad_norm": 0.12354888767004013, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7439 + }, + { + "epoch": 5.942492012779553, + "grad_norm": 0.10940376669168472, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7440 + }, + { + "epoch": 5.943290734824281, + "grad_norm": 0.05808279290795326, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7441 + }, + { + "epoch": 5.944089456869009, + "grad_norm": 0.19519653916358948, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 7442 + }, + { + "epoch": 5.944888178913738, + "grad_norm": 0.07913058996200562, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7443 + }, + { + "epoch": 5.945686900958466, + "grad_norm": 0.5150377750396729, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7444 + }, + { + "epoch": 5.946485623003195, + "grad_norm": 0.24083790183067322, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7445 + }, + { + "epoch": 5.947284345047923, + "grad_norm": 0.11291394382715225, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7446 + }, + { + "epoch": 5.948083067092652, + "grad_norm": 0.0899023786187172, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7447 + }, + { + "epoch": 5.94888178913738, + "grad_norm": 0.05489958077669144, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7448 + }, + { + "epoch": 5.949680511182109, + "grad_norm": 0.12375161051750183, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7449 + }, + { + "epoch": 5.950479233226837, + "grad_norm": 0.11610512435436249, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7450 + }, + { + "epoch": 5.951277955271565, + "grad_norm": 0.06953240931034088, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7451 + }, + { + "epoch": 5.952076677316294, + "grad_norm": 0.09784717857837677, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7452 + }, + { + "epoch": 5.952875399361022, + "grad_norm": 0.059533409774303436, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7453 + }, + { + "epoch": 5.953674121405751, + "grad_norm": 0.06361017376184464, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7454 + }, + { + "epoch": 5.954472843450479, + "grad_norm": 0.33739587664604187, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7455 + }, + { + "epoch": 5.955271565495208, + "grad_norm": 0.0726039931178093, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7456 + }, + { + "epoch": 5.956070287539936, + "grad_norm": 0.047813788056373596, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7457 + }, + { + "epoch": 5.956869009584665, + "grad_norm": 0.05501490831375122, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7458 + }, + { + "epoch": 5.957667731629393, + "grad_norm": 0.24806374311447144, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7459 + }, + { + "epoch": 5.958466453674122, + "grad_norm": 0.09020408987998962, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7460 + }, + { + "epoch": 5.9592651757188495, + "grad_norm": 0.09845588356256485, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7461 + }, + { + "epoch": 5.960063897763578, + "grad_norm": 0.2733388841152191, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7462 + }, + { + "epoch": 5.960862619808307, + "grad_norm": 0.04368302598595619, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7463 + }, + { + "epoch": 5.961661341853035, + "grad_norm": 0.06559797376394272, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7464 + }, + { + "epoch": 5.962460063897764, + "grad_norm": 0.08194267004728317, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7465 + }, + { + "epoch": 5.963258785942492, + "grad_norm": 0.08440488576889038, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7466 + }, + { + "epoch": 5.964057507987221, + "grad_norm": 0.07046753168106079, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7467 + }, + { + "epoch": 5.964856230031949, + "grad_norm": 0.061910174787044525, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7468 + }, + { + "epoch": 5.965654952076678, + "grad_norm": 0.06781110167503357, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7469 + }, + { + "epoch": 5.966453674121405, + "grad_norm": 0.0626576617360115, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7470 + }, + { + "epoch": 5.967252396166134, + "grad_norm": 0.05339542031288147, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7471 + }, + { + "epoch": 5.968051118210862, + "grad_norm": 0.09167633950710297, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7472 + }, + { + "epoch": 5.968849840255591, + "grad_norm": 0.07272132486104965, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7473 + }, + { + "epoch": 5.9696485623003195, + "grad_norm": 0.1218709796667099, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7474 + }, + { + "epoch": 5.970447284345048, + "grad_norm": 0.21024082601070404, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7475 + }, + { + "epoch": 5.9712460063897765, + "grad_norm": 0.08869504183530807, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7476 + }, + { + "epoch": 5.972044728434505, + "grad_norm": 0.05930836871266365, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7477 + }, + { + "epoch": 5.972843450479234, + "grad_norm": 0.10009569674730301, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7478 + }, + { + "epoch": 5.973642172523961, + "grad_norm": 0.2543089687824249, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7479 + }, + { + "epoch": 5.97444089456869, + "grad_norm": 0.04702993854880333, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7480 + }, + { + "epoch": 5.975239616613418, + "grad_norm": 0.12841154634952545, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7481 + }, + { + "epoch": 5.976038338658147, + "grad_norm": 0.10137920081615448, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7482 + }, + { + "epoch": 5.976837060702875, + "grad_norm": 0.0582512766122818, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7483 + }, + { + "epoch": 5.977635782747604, + "grad_norm": 0.06556501984596252, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7484 + }, + { + "epoch": 5.978434504792332, + "grad_norm": 0.2065235674381256, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7485 + }, + { + "epoch": 5.979233226837061, + "grad_norm": 0.07943716645240784, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7486 + }, + { + "epoch": 5.9800319488817895, + "grad_norm": 0.05257594957947731, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7487 + }, + { + "epoch": 5.980830670926517, + "grad_norm": 0.06949680298566818, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7488 + }, + { + "epoch": 5.981629392971246, + "grad_norm": 0.0967894196510315, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7489 + }, + { + "epoch": 5.982428115015974, + "grad_norm": 1.068231463432312, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7490 + }, + { + "epoch": 5.983226837060703, + "grad_norm": 0.0648348405957222, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7491 + }, + { + "epoch": 5.984025559105431, + "grad_norm": 0.2540450096130371, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7492 + }, + { + "epoch": 5.98482428115016, + "grad_norm": 0.1624346375465393, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7493 + }, + { + "epoch": 5.985623003194888, + "grad_norm": 0.10054703056812286, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7494 + }, + { + "epoch": 5.986421725239617, + "grad_norm": 0.05147058889269829, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7495 + }, + { + "epoch": 5.987220447284345, + "grad_norm": 0.10036633163690567, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7496 + }, + { + "epoch": 5.988019169329074, + "grad_norm": 0.14611777663230896, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7497 + }, + { + "epoch": 5.988817891373802, + "grad_norm": 0.12323570251464844, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7498 + }, + { + "epoch": 5.98961661341853, + "grad_norm": 0.04539888724684715, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7499 + }, + { + "epoch": 5.9904153354632586, + "grad_norm": 0.14555387198925018, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7500 + }, + { + "epoch": 5.991214057507987, + "grad_norm": 0.3205990195274353, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7501 + }, + { + "epoch": 5.992012779552716, + "grad_norm": 0.22900770604610443, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7502 + }, + { + "epoch": 5.992811501597444, + "grad_norm": 0.11138728260993958, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7503 + }, + { + "epoch": 5.993610223642173, + "grad_norm": 0.09425637125968933, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7504 + }, + { + "epoch": 5.994408945686901, + "grad_norm": 0.18409870564937592, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7505 + }, + { + "epoch": 5.99520766773163, + "grad_norm": 0.1610010713338852, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7506 + }, + { + "epoch": 5.996006389776358, + "grad_norm": 0.2304852306842804, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7507 + }, + { + "epoch": 5.996805111821086, + "grad_norm": 0.09830645471811295, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7508 + }, + { + "epoch": 5.997603833865814, + "grad_norm": 0.12319398671388626, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7509 + }, + { + "epoch": 5.998402555910543, + "grad_norm": 0.07925699651241302, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7510 + }, + { + "epoch": 5.9992012779552715, + "grad_norm": 0.07079242914915085, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7511 + }, + { + "epoch": 6.0, + "grad_norm": 0.14047275483608246, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7512 + }, + { + "epoch": 6.0007987220447285, + "grad_norm": 0.172583669424057, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7513 + }, + { + "epoch": 6.001597444089457, + "grad_norm": 0.3635086119174957, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7514 + }, + { + "epoch": 6.002396166134186, + "grad_norm": 0.14463695883750916, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7515 + }, + { + "epoch": 6.003194888178914, + "grad_norm": 0.24417585134506226, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7516 + }, + { + "epoch": 6.003993610223642, + "grad_norm": 0.25690382719039917, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7517 + }, + { + "epoch": 6.00479233226837, + "grad_norm": 0.12535394728183746, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 7518 + }, + { + "epoch": 6.005591054313099, + "grad_norm": 0.19279715418815613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7519 + }, + { + "epoch": 6.006389776357827, + "grad_norm": 0.10537917166948318, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7520 + }, + { + "epoch": 6.007188498402556, + "grad_norm": 0.07752633094787598, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7521 + }, + { + "epoch": 6.007987220447284, + "grad_norm": 0.10693971067667007, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7522 + }, + { + "epoch": 6.008785942492013, + "grad_norm": 0.06399057805538177, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7523 + }, + { + "epoch": 6.0095846645367414, + "grad_norm": 0.12577609717845917, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7524 + }, + { + "epoch": 6.01038338658147, + "grad_norm": 0.12770701944828033, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7525 + }, + { + "epoch": 6.0111821086261985, + "grad_norm": 0.07679085433483124, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7526 + }, + { + "epoch": 6.011980830670926, + "grad_norm": 0.14353524148464203, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7527 + }, + { + "epoch": 6.012779552715655, + "grad_norm": 0.3428184688091278, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7528 + }, + { + "epoch": 6.013578274760383, + "grad_norm": 0.1436242014169693, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 7529 + }, + { + "epoch": 6.014376996805112, + "grad_norm": 0.07608507573604584, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7530 + }, + { + "epoch": 6.01517571884984, + "grad_norm": 0.10932086408138275, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7531 + }, + { + "epoch": 6.015974440894569, + "grad_norm": 0.07631878554821014, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7532 + }, + { + "epoch": 6.016773162939297, + "grad_norm": 0.0718175396323204, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7533 + }, + { + "epoch": 6.017571884984026, + "grad_norm": 0.07661164551973343, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7534 + }, + { + "epoch": 6.018370607028754, + "grad_norm": 0.10753245651721954, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7535 + }, + { + "epoch": 6.019169329073482, + "grad_norm": 0.12740729749202728, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7536 + }, + { + "epoch": 6.0199680511182105, + "grad_norm": 0.14345388114452362, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7537 + }, + { + "epoch": 6.020766773162939, + "grad_norm": 0.13860031962394714, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7538 + }, + { + "epoch": 6.021565495207668, + "grad_norm": 0.07766555994749069, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7539 + }, + { + "epoch": 6.022364217252396, + "grad_norm": 0.11253347247838974, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7540 + }, + { + "epoch": 6.023162939297125, + "grad_norm": 0.18870452046394348, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7541 + }, + { + "epoch": 6.023961661341853, + "grad_norm": 0.12401654571294785, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7542 + }, + { + "epoch": 6.024760383386582, + "grad_norm": 0.08025321364402771, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7543 + }, + { + "epoch": 6.02555910543131, + "grad_norm": 0.12504157423973083, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7544 + }, + { + "epoch": 6.026357827476039, + "grad_norm": 0.07099851220846176, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7545 + }, + { + "epoch": 6.027156549520766, + "grad_norm": 0.09573683142662048, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7546 + }, + { + "epoch": 6.027955271565495, + "grad_norm": 0.18280553817749023, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7547 + }, + { + "epoch": 6.0287539936102235, + "grad_norm": 0.15688058733940125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7548 + }, + { + "epoch": 6.029552715654952, + "grad_norm": 0.11738436669111252, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7549 + }, + { + "epoch": 6.0303514376996805, + "grad_norm": 1.275103211402893, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7550 + }, + { + "epoch": 6.031150159744409, + "grad_norm": 0.39542102813720703, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7551 + }, + { + "epoch": 6.031948881789138, + "grad_norm": 0.32140371203422546, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7552 + }, + { + "epoch": 6.032747603833866, + "grad_norm": 0.2855371832847595, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7553 + }, + { + "epoch": 6.033546325878595, + "grad_norm": 0.14987513422966003, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7554 + }, + { + "epoch": 6.034345047923322, + "grad_norm": 0.25978198647499084, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7555 + }, + { + "epoch": 6.035143769968051, + "grad_norm": 0.14043942093849182, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7556 + }, + { + "epoch": 6.035942492012779, + "grad_norm": 0.16670344769954681, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7557 + }, + { + "epoch": 6.036741214057508, + "grad_norm": 0.1668681800365448, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7558 + }, + { + "epoch": 6.037539936102236, + "grad_norm": 0.11135906726121902, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7559 + }, + { + "epoch": 6.038338658146965, + "grad_norm": 0.26222026348114014, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7560 + }, + { + "epoch": 6.039137380191693, + "grad_norm": 0.1670113205909729, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7561 + }, + { + "epoch": 6.039936102236422, + "grad_norm": 0.15860766172409058, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7562 + }, + { + "epoch": 6.0407348242811505, + "grad_norm": 0.2577793300151825, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7563 + }, + { + "epoch": 6.041533546325879, + "grad_norm": 0.11147591471672058, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7564 + }, + { + "epoch": 6.042332268370607, + "grad_norm": 0.18452385067939758, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7565 + }, + { + "epoch": 6.043130990415335, + "grad_norm": 0.19697625935077667, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7566 + }, + { + "epoch": 6.043929712460064, + "grad_norm": 0.08586452901363373, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7567 + }, + { + "epoch": 6.044728434504792, + "grad_norm": 0.18721693754196167, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7568 + }, + { + "epoch": 6.045527156549521, + "grad_norm": 0.13190758228302002, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7569 + }, + { + "epoch": 6.046325878594249, + "grad_norm": 0.09424075484275818, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7570 + }, + { + "epoch": 6.047124600638978, + "grad_norm": 0.15252210199832916, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7571 + }, + { + "epoch": 6.047923322683706, + "grad_norm": 0.06378420442342758, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7572 + }, + { + "epoch": 6.048722044728435, + "grad_norm": 0.07665325701236725, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7573 + }, + { + "epoch": 6.0495207667731625, + "grad_norm": 0.0847245529294014, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7574 + }, + { + "epoch": 6.050319488817891, + "grad_norm": 0.034070566296577454, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7575 + }, + { + "epoch": 6.05111821086262, + "grad_norm": 0.08149915188550949, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7576 + }, + { + "epoch": 6.051916932907348, + "grad_norm": 0.07882412523031235, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7577 + }, + { + "epoch": 6.052715654952077, + "grad_norm": 0.055492956191301346, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7578 + }, + { + "epoch": 6.053514376996805, + "grad_norm": 0.10246025770902634, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7579 + }, + { + "epoch": 6.054313099041534, + "grad_norm": 0.11067861318588257, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7580 + }, + { + "epoch": 6.055111821086262, + "grad_norm": 0.06063758581876755, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7581 + }, + { + "epoch": 6.055910543130991, + "grad_norm": 0.06848330795764923, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 7582 + }, + { + "epoch": 6.056709265175719, + "grad_norm": 0.10336993634700775, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7583 + }, + { + "epoch": 6.057507987220447, + "grad_norm": 0.06081530824303627, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7584 + }, + { + "epoch": 6.0583067092651754, + "grad_norm": 0.08049804717302322, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7585 + }, + { + "epoch": 6.059105431309904, + "grad_norm": 0.09174875915050507, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7586 + }, + { + "epoch": 6.0599041533546325, + "grad_norm": 0.06121581420302391, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7587 + }, + { + "epoch": 6.060702875399361, + "grad_norm": 0.10653077065944672, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7588 + }, + { + "epoch": 6.06150159744409, + "grad_norm": 0.0676097571849823, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7589 + }, + { + "epoch": 6.062300319488818, + "grad_norm": 0.0625678300857544, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7590 + }, + { + "epoch": 6.063099041533547, + "grad_norm": 0.07936695963144302, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7591 + }, + { + "epoch": 6.063897763578275, + "grad_norm": 0.06149541214108467, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7592 + }, + { + "epoch": 6.064696485623003, + "grad_norm": 0.04549092426896095, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7593 + }, + { + "epoch": 6.065495207667731, + "grad_norm": 0.06483953446149826, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7594 + }, + { + "epoch": 6.06629392971246, + "grad_norm": 0.04048188030719757, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7595 + }, + { + "epoch": 6.067092651757188, + "grad_norm": 0.038281429558992386, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7596 + }, + { + "epoch": 6.067891373801917, + "grad_norm": 0.06686673313379288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7597 + }, + { + "epoch": 6.068690095846645, + "grad_norm": 0.09025852382183075, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7598 + }, + { + "epoch": 6.069488817891374, + "grad_norm": 0.07517793774604797, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7599 + }, + { + "epoch": 6.0702875399361025, + "grad_norm": 0.06342573463916779, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7600 + }, + { + "epoch": 6.071086261980831, + "grad_norm": 0.08630760759115219, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7601 + }, + { + "epoch": 6.0718849840255595, + "grad_norm": 0.06443625688552856, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7602 + }, + { + "epoch": 6.072683706070287, + "grad_norm": 0.08748311549425125, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7603 + }, + { + "epoch": 6.073482428115016, + "grad_norm": 0.051623452454805374, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7604 + }, + { + "epoch": 6.074281150159744, + "grad_norm": 0.09098891913890839, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7605 + }, + { + "epoch": 6.075079872204473, + "grad_norm": 0.14741428196430206, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7606 + }, + { + "epoch": 6.075878594249201, + "grad_norm": 0.064545176923275, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7607 + }, + { + "epoch": 6.07667731629393, + "grad_norm": 0.09775100648403168, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7608 + }, + { + "epoch": 6.077476038338658, + "grad_norm": 0.14192643761634827, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7609 + }, + { + "epoch": 6.078274760383387, + "grad_norm": 0.05390379950404167, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7610 + }, + { + "epoch": 6.079073482428115, + "grad_norm": 0.35628536343574524, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7611 + }, + { + "epoch": 6.079872204472843, + "grad_norm": 0.11727920919656754, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7612 + }, + { + "epoch": 6.080670926517572, + "grad_norm": 0.053165338933467865, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7613 + }, + { + "epoch": 6.0814696485623, + "grad_norm": 0.12718519568443298, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7614 + }, + { + "epoch": 6.082268370607029, + "grad_norm": 0.12406741827726364, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7615 + }, + { + "epoch": 6.083067092651757, + "grad_norm": 0.05323740839958191, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7616 + }, + { + "epoch": 6.083865814696486, + "grad_norm": 0.09811960160732269, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7617 + }, + { + "epoch": 6.084664536741214, + "grad_norm": 0.12453506886959076, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7618 + }, + { + "epoch": 6.085463258785943, + "grad_norm": 0.13459496200084686, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7619 + }, + { + "epoch": 6.086261980830671, + "grad_norm": 0.20130378007888794, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7620 + }, + { + "epoch": 6.0870607028754, + "grad_norm": 0.11361974477767944, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7621 + }, + { + "epoch": 6.087859424920127, + "grad_norm": 0.07432135194540024, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7622 + }, + { + "epoch": 6.088658146964856, + "grad_norm": 0.14522314071655273, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7623 + }, + { + "epoch": 6.0894568690095845, + "grad_norm": 0.050937261432409286, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7624 + }, + { + "epoch": 6.090255591054313, + "grad_norm": 0.12386021763086319, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7625 + }, + { + "epoch": 6.0910543130990416, + "grad_norm": 0.1498231738805771, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7626 + }, + { + "epoch": 6.09185303514377, + "grad_norm": 0.042041294276714325, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7627 + }, + { + "epoch": 6.092651757188499, + "grad_norm": 0.1103961393237114, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7628 + }, + { + "epoch": 6.093450479233227, + "grad_norm": 0.12362606078386307, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7629 + }, + { + "epoch": 6.094249201277956, + "grad_norm": 0.07069346308708191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7630 + }, + { + "epoch": 6.095047923322683, + "grad_norm": 0.1306593418121338, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7631 + }, + { + "epoch": 6.095846645367412, + "grad_norm": 0.11293961852788925, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7632 + }, + { + "epoch": 6.09664536741214, + "grad_norm": 0.07145176827907562, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7633 + }, + { + "epoch": 6.097444089456869, + "grad_norm": 0.11122562736272812, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7634 + }, + { + "epoch": 6.098242811501597, + "grad_norm": 0.039713576436042786, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7635 + }, + { + "epoch": 6.099041533546326, + "grad_norm": 0.11573004722595215, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7636 + }, + { + "epoch": 6.0998402555910545, + "grad_norm": 0.11995833367109299, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7637 + }, + { + "epoch": 6.100638977635783, + "grad_norm": 0.03895663470029831, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7638 + }, + { + "epoch": 6.1014376996805115, + "grad_norm": 0.11274216324090958, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7639 + }, + { + "epoch": 6.102236421725239, + "grad_norm": 0.14242613315582275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7640 + }, + { + "epoch": 6.103035143769968, + "grad_norm": 0.04954848438501358, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7641 + }, + { + "epoch": 6.103833865814696, + "grad_norm": 0.10814809799194336, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7642 + }, + { + "epoch": 6.104632587859425, + "grad_norm": 0.11696363240480423, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7643 + }, + { + "epoch": 6.105431309904153, + "grad_norm": 0.04597959294915199, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7644 + }, + { + "epoch": 6.106230031948882, + "grad_norm": 0.16304457187652588, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7645 + }, + { + "epoch": 6.10702875399361, + "grad_norm": 0.14835208654403687, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7646 + }, + { + "epoch": 6.107827476038339, + "grad_norm": 0.06062949076294899, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7647 + }, + { + "epoch": 6.108626198083067, + "grad_norm": 0.1033453568816185, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7648 + }, + { + "epoch": 6.109424920127796, + "grad_norm": 0.14823280274868011, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7649 + }, + { + "epoch": 6.110223642172524, + "grad_norm": 0.18282924592494965, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7650 + }, + { + "epoch": 6.111022364217252, + "grad_norm": 0.17962203919887543, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7651 + }, + { + "epoch": 6.111821086261981, + "grad_norm": 0.12176015228033066, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7652 + }, + { + "epoch": 6.112619808306709, + "grad_norm": 0.07326921075582504, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7653 + }, + { + "epoch": 6.113418530351438, + "grad_norm": 0.24457645416259766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7654 + }, + { + "epoch": 6.114217252396166, + "grad_norm": 0.1442916989326477, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7655 + }, + { + "epoch": 6.115015974440895, + "grad_norm": 0.0716436356306076, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7656 + }, + { + "epoch": 6.115814696485623, + "grad_norm": 0.20782648026943207, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7657 + }, + { + "epoch": 6.116613418530352, + "grad_norm": 0.1183728352189064, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7658 + }, + { + "epoch": 6.11741214057508, + "grad_norm": 0.13251493871212006, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7659 + }, + { + "epoch": 6.118210862619808, + "grad_norm": 0.21223802864551544, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7660 + }, + { + "epoch": 6.1190095846645365, + "grad_norm": 0.0811460018157959, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7661 + }, + { + "epoch": 6.119808306709265, + "grad_norm": 0.13528718054294586, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7662 + }, + { + "epoch": 6.1206070287539935, + "grad_norm": 0.11806038022041321, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7663 + }, + { + "epoch": 6.121405750798722, + "grad_norm": 0.10022544860839844, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7664 + }, + { + "epoch": 6.122204472843451, + "grad_norm": 0.21452540159225464, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7665 + }, + { + "epoch": 6.123003194888179, + "grad_norm": 0.11949847638607025, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7666 + }, + { + "epoch": 6.123801916932908, + "grad_norm": 0.12636634707450867, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7667 + }, + { + "epoch": 6.124600638977636, + "grad_norm": 0.17132572829723358, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7668 + }, + { + "epoch": 6.125399361022364, + "grad_norm": 0.1116800457239151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7669 + }, + { + "epoch": 6.126198083067092, + "grad_norm": 0.13965120911598206, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7670 + }, + { + "epoch": 6.126996805111821, + "grad_norm": 0.1346610188484192, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7671 + }, + { + "epoch": 6.127795527156549, + "grad_norm": 0.07977228611707687, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7672 + }, + { + "epoch": 6.128594249201278, + "grad_norm": 0.21412506699562073, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7673 + }, + { + "epoch": 6.1293929712460065, + "grad_norm": 0.172305628657341, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7674 + }, + { + "epoch": 6.130191693290735, + "grad_norm": 0.10782980173826218, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7675 + }, + { + "epoch": 6.1309904153354635, + "grad_norm": 0.23166432976722717, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7676 + }, + { + "epoch": 6.131789137380192, + "grad_norm": 0.12337028980255127, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7677 + }, + { + "epoch": 6.13258785942492, + "grad_norm": 0.11406251043081284, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7678 + }, + { + "epoch": 6.133386581469648, + "grad_norm": 0.19163282215595245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7679 + }, + { + "epoch": 6.134185303514377, + "grad_norm": 0.06671248376369476, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7680 + }, + { + "epoch": 6.134984025559105, + "grad_norm": 0.13190557062625885, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7681 + }, + { + "epoch": 6.135782747603834, + "grad_norm": 0.20761321485042572, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7682 + }, + { + "epoch": 6.136581469648562, + "grad_norm": 0.08118047565221786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7683 + }, + { + "epoch": 6.137380191693291, + "grad_norm": 0.1458984613418579, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7684 + }, + { + "epoch": 6.138178913738019, + "grad_norm": 0.1305929571390152, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7685 + }, + { + "epoch": 6.138977635782748, + "grad_norm": 0.0972108244895935, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7686 + }, + { + "epoch": 6.139776357827476, + "grad_norm": 0.14246216416358948, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7687 + }, + { + "epoch": 6.140575079872204, + "grad_norm": 0.04341820999979973, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7688 + }, + { + "epoch": 6.141373801916933, + "grad_norm": 0.127020001411438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7689 + }, + { + "epoch": 6.142172523961661, + "grad_norm": 0.08494339138269424, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7690 + }, + { + "epoch": 6.14297124600639, + "grad_norm": 0.11377454549074173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7691 + }, + { + "epoch": 6.143769968051118, + "grad_norm": 0.13752779364585876, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7692 + }, + { + "epoch": 6.144568690095847, + "grad_norm": 0.054878801107406616, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7693 + }, + { + "epoch": 6.145367412140575, + "grad_norm": 0.11313790827989578, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7694 + }, + { + "epoch": 6.146166134185304, + "grad_norm": 0.04388728365302086, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7695 + }, + { + "epoch": 6.146964856230032, + "grad_norm": 0.12842994928359985, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7696 + }, + { + "epoch": 6.147763578274761, + "grad_norm": 0.1374971568584442, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7697 + }, + { + "epoch": 6.1485623003194885, + "grad_norm": 0.1082429438829422, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7698 + }, + { + "epoch": 6.149361022364217, + "grad_norm": 0.14329178631305695, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7699 + }, + { + "epoch": 6.1501597444089455, + "grad_norm": 0.07794678211212158, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7700 + }, + { + "epoch": 6.150958466453674, + "grad_norm": 0.10680928826332092, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7701 + }, + { + "epoch": 6.151757188498403, + "grad_norm": 0.11628691852092743, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7702 + }, + { + "epoch": 6.152555910543131, + "grad_norm": 0.03565143793821335, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7703 + }, + { + "epoch": 6.15335463258786, + "grad_norm": 0.10634133219718933, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7704 + }, + { + "epoch": 6.154153354632588, + "grad_norm": 0.10307054221630096, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7705 + }, + { + "epoch": 6.154952076677317, + "grad_norm": 0.05591967701911926, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7706 + }, + { + "epoch": 6.155750798722044, + "grad_norm": 0.07205721735954285, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7707 + }, + { + "epoch": 6.156549520766773, + "grad_norm": 0.05020968243479729, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7708 + }, + { + "epoch": 6.157348242811501, + "grad_norm": 0.037087470293045044, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7709 + }, + { + "epoch": 6.15814696485623, + "grad_norm": 0.06322529166936874, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7710 + }, + { + "epoch": 6.1589456869009584, + "grad_norm": 0.03881093114614487, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7711 + }, + { + "epoch": 6.159744408945687, + "grad_norm": 0.06219052895903587, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7712 + }, + { + "epoch": 6.1605431309904155, + "grad_norm": 0.043313659727573395, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7713 + }, + { + "epoch": 6.161341853035144, + "grad_norm": 0.05460439994931221, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7714 + }, + { + "epoch": 6.162140575079873, + "grad_norm": 0.045017000287771225, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7715 + }, + { + "epoch": 6.1629392971246, + "grad_norm": 0.08029863983392715, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7716 + }, + { + "epoch": 6.163738019169329, + "grad_norm": 0.06935936212539673, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7717 + }, + { + "epoch": 6.164536741214057, + "grad_norm": 0.12617695331573486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7718 + }, + { + "epoch": 6.165335463258786, + "grad_norm": 0.09746283292770386, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7719 + }, + { + "epoch": 6.166134185303514, + "grad_norm": 0.038731649518013, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7720 + }, + { + "epoch": 6.166932907348243, + "grad_norm": 0.1054256334900856, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7721 + }, + { + "epoch": 6.167731629392971, + "grad_norm": 0.0833977535367012, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7722 + }, + { + "epoch": 6.1685303514377, + "grad_norm": 1.3529000282287598, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7723 + }, + { + "epoch": 6.169329073482428, + "grad_norm": 0.06748781353235245, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7724 + }, + { + "epoch": 6.170127795527157, + "grad_norm": 0.06015792861580849, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7725 + }, + { + "epoch": 6.170926517571885, + "grad_norm": 0.07760192453861237, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7726 + }, + { + "epoch": 6.171725239616613, + "grad_norm": 0.09536328911781311, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7727 + }, + { + "epoch": 6.172523961661342, + "grad_norm": 0.051248203963041306, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7728 + }, + { + "epoch": 6.17332268370607, + "grad_norm": 0.09610000252723694, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7729 + }, + { + "epoch": 6.174121405750799, + "grad_norm": 0.0803515687584877, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7730 + }, + { + "epoch": 6.174920127795527, + "grad_norm": 0.0820179283618927, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7731 + }, + { + "epoch": 6.175718849840256, + "grad_norm": 0.08880780637264252, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7732 + }, + { + "epoch": 6.176517571884984, + "grad_norm": 0.12188591808080673, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7733 + }, + { + "epoch": 6.177316293929713, + "grad_norm": 0.06245967745780945, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7734 + }, + { + "epoch": 6.178115015974441, + "grad_norm": 0.06608586013317108, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7735 + }, + { + "epoch": 6.178913738019169, + "grad_norm": 0.08542132377624512, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7736 + }, + { + "epoch": 6.1797124600638975, + "grad_norm": 0.06510723382234573, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7737 + }, + { + "epoch": 6.180511182108626, + "grad_norm": 0.161012202501297, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7738 + }, + { + "epoch": 6.181309904153355, + "grad_norm": 0.07943159341812134, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7739 + }, + { + "epoch": 6.182108626198083, + "grad_norm": 0.07735269516706467, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7740 + }, + { + "epoch": 6.182907348242812, + "grad_norm": 0.07452470809221268, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7741 + }, + { + "epoch": 6.18370607028754, + "grad_norm": 0.06378357857465744, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7742 + }, + { + "epoch": 6.184504792332269, + "grad_norm": 0.06149968132376671, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7743 + }, + { + "epoch": 6.185303514376997, + "grad_norm": 0.06558738648891449, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7744 + }, + { + "epoch": 6.186102236421725, + "grad_norm": 0.06004631146788597, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7745 + }, + { + "epoch": 6.186900958466453, + "grad_norm": 0.09972328692674637, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7746 + }, + { + "epoch": 6.187699680511182, + "grad_norm": 0.059344276785850525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7747 + }, + { + "epoch": 6.18849840255591, + "grad_norm": 0.15083496272563934, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7748 + }, + { + "epoch": 6.189297124600639, + "grad_norm": 0.08041606843471527, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7749 + }, + { + "epoch": 6.1900958466453675, + "grad_norm": 0.0801318883895874, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7750 + }, + { + "epoch": 6.190894568690096, + "grad_norm": 0.13313926756381989, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7751 + }, + { + "epoch": 6.1916932907348246, + "grad_norm": 0.07887420803308487, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7752 + }, + { + "epoch": 6.192492012779553, + "grad_norm": 0.08653397113084793, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7753 + }, + { + "epoch": 6.193290734824281, + "grad_norm": 0.12184617668390274, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7754 + }, + { + "epoch": 6.194089456869009, + "grad_norm": 0.05356535315513611, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7755 + }, + { + "epoch": 6.194888178913738, + "grad_norm": 0.09529519081115723, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7756 + }, + { + "epoch": 6.195686900958466, + "grad_norm": 0.07658126950263977, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7757 + }, + { + "epoch": 6.196485623003195, + "grad_norm": 0.0785149484872818, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7758 + }, + { + "epoch": 6.197284345047923, + "grad_norm": 0.10748651623725891, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7759 + }, + { + "epoch": 6.198083067092652, + "grad_norm": 0.056907687336206436, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7760 + }, + { + "epoch": 6.19888178913738, + "grad_norm": 0.3713622987270355, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7761 + }, + { + "epoch": 6.199680511182109, + "grad_norm": 0.16671019792556763, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7762 + }, + { + "epoch": 6.2004792332268375, + "grad_norm": 0.10214395076036453, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7763 + }, + { + "epoch": 6.201277955271565, + "grad_norm": 0.09181013703346252, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7764 + }, + { + "epoch": 6.202076677316294, + "grad_norm": 0.18003405630588531, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7765 + }, + { + "epoch": 6.202875399361022, + "grad_norm": 0.1032429188489914, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7766 + }, + { + "epoch": 6.203674121405751, + "grad_norm": 0.06787005811929703, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7767 + }, + { + "epoch": 6.204472843450479, + "grad_norm": 0.09422674775123596, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7768 + }, + { + "epoch": 6.205271565495208, + "grad_norm": 0.04083932563662529, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7769 + }, + { + "epoch": 6.206070287539936, + "grad_norm": 0.1368017941713333, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7770 + }, + { + "epoch": 6.206869009584665, + "grad_norm": 0.23276877403259277, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7771 + }, + { + "epoch": 6.207667731629393, + "grad_norm": 0.13092860579490662, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7772 + }, + { + "epoch": 6.208466453674121, + "grad_norm": 0.14030441641807556, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7773 + }, + { + "epoch": 6.2092651757188495, + "grad_norm": 0.2016047090291977, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7774 + }, + { + "epoch": 6.210063897763578, + "grad_norm": 0.1224871277809143, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7775 + }, + { + "epoch": 6.210862619808307, + "grad_norm": 0.10741977393627167, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7776 + }, + { + "epoch": 6.211661341853035, + "grad_norm": 0.19775021076202393, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7777 + }, + { + "epoch": 6.212460063897764, + "grad_norm": 0.06731278449296951, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7778 + }, + { + "epoch": 6.213258785942492, + "grad_norm": 0.14070862531661987, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7779 + }, + { + "epoch": 6.214057507987221, + "grad_norm": 0.1267949938774109, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7780 + }, + { + "epoch": 6.214856230031949, + "grad_norm": 0.0694371834397316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7781 + }, + { + "epoch": 6.215654952076678, + "grad_norm": 0.12222267687320709, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7782 + }, + { + "epoch": 6.216453674121405, + "grad_norm": 0.1105445921421051, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7783 + }, + { + "epoch": 6.217252396166134, + "grad_norm": 0.05993608012795448, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7784 + }, + { + "epoch": 6.218051118210862, + "grad_norm": 0.11157821118831635, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7785 + }, + { + "epoch": 6.218849840255591, + "grad_norm": 0.05242336913943291, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7786 + }, + { + "epoch": 6.2196485623003195, + "grad_norm": 0.046115025877952576, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7787 + }, + { + "epoch": 6.220447284345048, + "grad_norm": 0.04029909893870354, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7788 + }, + { + "epoch": 6.2212460063897765, + "grad_norm": 0.057172924280166626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7789 + }, + { + "epoch": 6.222044728434505, + "grad_norm": 0.04958837479352951, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7790 + }, + { + "epoch": 6.222843450479234, + "grad_norm": 0.046313852071762085, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7791 + }, + { + "epoch": 6.223642172523961, + "grad_norm": 0.03824630752205849, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7792 + }, + { + "epoch": 6.22444089456869, + "grad_norm": 0.07159019261598587, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7793 + }, + { + "epoch": 6.225239616613418, + "grad_norm": 0.06316389888525009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7794 + }, + { + "epoch": 6.226038338658147, + "grad_norm": 0.088447704911232, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7795 + }, + { + "epoch": 6.226837060702875, + "grad_norm": 0.08749943226575851, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7796 + }, + { + "epoch": 6.227635782747604, + "grad_norm": 0.08757520467042923, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7797 + }, + { + "epoch": 6.228434504792332, + "grad_norm": 0.10777202993631363, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7798 + }, + { + "epoch": 6.229233226837061, + "grad_norm": 0.15780584514141083, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7799 + }, + { + "epoch": 6.2300319488817895, + "grad_norm": 0.10375814139842987, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7800 + }, + { + "epoch": 6.230830670926518, + "grad_norm": 0.3544321656227112, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7801 + }, + { + "epoch": 6.231629392971246, + "grad_norm": 0.11117644608020782, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7802 + }, + { + "epoch": 6.232428115015974, + "grad_norm": 0.13096286356449127, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7803 + }, + { + "epoch": 6.233226837060703, + "grad_norm": 0.2706630229949951, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7804 + }, + { + "epoch": 6.234025559105431, + "grad_norm": 0.05805981904268265, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7805 + }, + { + "epoch": 6.23482428115016, + "grad_norm": 0.14731241762638092, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7806 + }, + { + "epoch": 6.235623003194888, + "grad_norm": 0.08912478387355804, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7807 + }, + { + "epoch": 6.236421725239617, + "grad_norm": 0.15754206478595734, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7808 + }, + { + "epoch": 6.237220447284345, + "grad_norm": 0.21143318712711334, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7809 + }, + { + "epoch": 6.238019169329074, + "grad_norm": 0.11839418858289719, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7810 + }, + { + "epoch": 6.2388178913738015, + "grad_norm": 0.23939856886863708, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7811 + }, + { + "epoch": 6.23961661341853, + "grad_norm": 0.1438305526971817, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7812 + }, + { + "epoch": 6.2404153354632586, + "grad_norm": 0.11111237108707428, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7813 + }, + { + "epoch": 6.241214057507987, + "grad_norm": 0.19577394425868988, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7814 + }, + { + "epoch": 6.242012779552716, + "grad_norm": 0.1399260312318802, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7815 + }, + { + "epoch": 6.242811501597444, + "grad_norm": 0.16393627226352692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7816 + }, + { + "epoch": 6.243610223642173, + "grad_norm": 0.15071940422058105, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7817 + }, + { + "epoch": 6.244408945686901, + "grad_norm": 0.2121957242488861, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7818 + }, + { + "epoch": 6.24520766773163, + "grad_norm": 0.09854442626237869, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7819 + }, + { + "epoch": 6.246006389776358, + "grad_norm": 0.1327667534351349, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7820 + }, + { + "epoch": 6.246805111821086, + "grad_norm": 0.13909243047237396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7821 + }, + { + "epoch": 6.247603833865814, + "grad_norm": 0.08482292294502258, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7822 + }, + { + "epoch": 6.248402555910543, + "grad_norm": 0.0918656438589096, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7823 + }, + { + "epoch": 6.2492012779552715, + "grad_norm": 0.1352611631155014, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7824 + }, + { + "epoch": 6.25, + "grad_norm": 0.06178867816925049, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7825 + }, + { + "epoch": 6.2507987220447285, + "grad_norm": 0.1285342425107956, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7826 + }, + { + "epoch": 6.251597444089457, + "grad_norm": 0.17862951755523682, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7827 + }, + { + "epoch": 6.252396166134186, + "grad_norm": 0.574928343296051, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7828 + }, + { + "epoch": 6.253194888178914, + "grad_norm": 0.11522867530584335, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 7829 + }, + { + "epoch": 6.253993610223642, + "grad_norm": 0.08348001539707184, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7830 + }, + { + "epoch": 6.25479233226837, + "grad_norm": 0.1015007346868515, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7831 + }, + { + "epoch": 6.255591054313099, + "grad_norm": 0.18213561177253723, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7832 + }, + { + "epoch": 6.256389776357827, + "grad_norm": 0.1056833565235138, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7833 + }, + { + "epoch": 6.257188498402556, + "grad_norm": 0.09715890139341354, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7834 + }, + { + "epoch": 6.257987220447284, + "grad_norm": 0.17651355266571045, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7835 + }, + { + "epoch": 6.258785942492013, + "grad_norm": 0.11858265846967697, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7836 + }, + { + "epoch": 6.2595846645367414, + "grad_norm": 0.1400168240070343, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7837 + }, + { + "epoch": 6.26038338658147, + "grad_norm": 0.2133244276046753, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7838 + }, + { + "epoch": 6.261182108626198, + "grad_norm": 0.087309330701828, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7839 + }, + { + "epoch": 6.261980830670926, + "grad_norm": 0.07735110074281693, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7840 + }, + { + "epoch": 6.262779552715655, + "grad_norm": 0.08314932882785797, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7841 + }, + { + "epoch": 6.263578274760383, + "grad_norm": 0.13448217511177063, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7842 + }, + { + "epoch": 6.264376996805112, + "grad_norm": 1.4022712707519531, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7843 + }, + { + "epoch": 6.26517571884984, + "grad_norm": 0.1107354387640953, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7844 + }, + { + "epoch": 6.265974440894569, + "grad_norm": 0.17282478511333466, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7845 + }, + { + "epoch": 6.266773162939297, + "grad_norm": 0.0903516560792923, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7846 + }, + { + "epoch": 6.267571884984026, + "grad_norm": 0.07628770172595978, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7847 + }, + { + "epoch": 6.268370607028754, + "grad_norm": 0.08877440541982651, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7848 + }, + { + "epoch": 6.269169329073483, + "grad_norm": 0.041159700602293015, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7849 + }, + { + "epoch": 6.2699680511182105, + "grad_norm": 0.09187504649162292, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7850 + }, + { + "epoch": 6.270766773162939, + "grad_norm": 0.11252478510141373, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7851 + }, + { + "epoch": 6.271565495207668, + "grad_norm": 0.04354100301861763, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7852 + }, + { + "epoch": 6.272364217252396, + "grad_norm": 0.06845738738775253, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7853 + }, + { + "epoch": 6.273162939297125, + "grad_norm": 0.047235157340765, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7854 + }, + { + "epoch": 6.273961661341853, + "grad_norm": 0.04571741819381714, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7855 + }, + { + "epoch": 6.274760383386582, + "grad_norm": 0.09801016747951508, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7856 + }, + { + "epoch": 6.27555910543131, + "grad_norm": 0.12422922253608704, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7857 + }, + { + "epoch": 6.276357827476039, + "grad_norm": 0.07283129543066025, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7858 + }, + { + "epoch": 6.277156549520766, + "grad_norm": 0.07217510044574738, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7859 + }, + { + "epoch": 6.277955271565495, + "grad_norm": 0.1102033257484436, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7860 + }, + { + "epoch": 6.2787539936102235, + "grad_norm": 0.0814276710152626, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7861 + }, + { + "epoch": 6.279552715654952, + "grad_norm": 0.08247577399015427, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7862 + }, + { + "epoch": 6.2803514376996805, + "grad_norm": 0.04042622447013855, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7863 + }, + { + "epoch": 6.281150159744409, + "grad_norm": 0.049153268337249756, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7864 + }, + { + "epoch": 6.281948881789138, + "grad_norm": 0.07062675058841705, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7865 + }, + { + "epoch": 6.282747603833866, + "grad_norm": 0.06458686292171478, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7866 + }, + { + "epoch": 6.283546325878595, + "grad_norm": 0.093512162566185, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7867 + }, + { + "epoch": 6.284345047923322, + "grad_norm": 0.054384954273700714, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7868 + }, + { + "epoch": 6.285143769968051, + "grad_norm": 0.06253736466169357, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7869 + }, + { + "epoch": 6.285942492012779, + "grad_norm": 0.05566808953881264, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7870 + }, + { + "epoch": 6.286741214057508, + "grad_norm": 0.07693472504615784, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7871 + }, + { + "epoch": 6.287539936102236, + "grad_norm": 0.04471312463283539, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7872 + }, + { + "epoch": 6.288338658146965, + "grad_norm": 0.050770796835422516, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7873 + }, + { + "epoch": 6.289137380191693, + "grad_norm": 0.04736769199371338, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7874 + }, + { + "epoch": 6.289936102236422, + "grad_norm": 0.06550426036119461, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7875 + }, + { + "epoch": 6.2907348242811505, + "grad_norm": 0.0524384006857872, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7876 + }, + { + "epoch": 6.291533546325878, + "grad_norm": 0.10091802477836609, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7877 + }, + { + "epoch": 6.292332268370607, + "grad_norm": 0.14296530187129974, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7878 + }, + { + "epoch": 6.293130990415335, + "grad_norm": 0.08703069388866425, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7879 + }, + { + "epoch": 6.293929712460064, + "grad_norm": 0.05628393217921257, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7880 + }, + { + "epoch": 6.294728434504792, + "grad_norm": 0.09164825826883316, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7881 + }, + { + "epoch": 6.295527156549521, + "grad_norm": 0.09182474762201309, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7882 + }, + { + "epoch": 6.296325878594249, + "grad_norm": 0.03495810180902481, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7883 + }, + { + "epoch": 6.297124600638978, + "grad_norm": 0.07738466560840607, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7884 + }, + { + "epoch": 6.297923322683706, + "grad_norm": 0.06034242361783981, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7885 + }, + { + "epoch": 6.298722044728435, + "grad_norm": 0.04083844646811485, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7886 + }, + { + "epoch": 6.2995207667731625, + "grad_norm": 0.0918336734175682, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7887 + }, + { + "epoch": 6.300319488817891, + "grad_norm": 0.07351864874362946, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7888 + }, + { + "epoch": 6.30111821086262, + "grad_norm": 0.042986564338207245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7889 + }, + { + "epoch": 6.301916932907348, + "grad_norm": 0.05983031541109085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7890 + }, + { + "epoch": 6.302715654952077, + "grad_norm": 0.10980594903230667, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7891 + }, + { + "epoch": 6.303514376996805, + "grad_norm": 0.04517138749361038, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7892 + }, + { + "epoch": 6.304313099041534, + "grad_norm": 0.08489427715539932, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7893 + }, + { + "epoch": 6.305111821086262, + "grad_norm": 0.040421262383461, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7894 + }, + { + "epoch": 6.305910543130991, + "grad_norm": 0.0438009649515152, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7895 + }, + { + "epoch": 6.306709265175719, + "grad_norm": 0.05797100067138672, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7896 + }, + { + "epoch": 6.307507987220447, + "grad_norm": 0.08798980712890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7897 + }, + { + "epoch": 6.3083067092651754, + "grad_norm": 0.0502130500972271, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7898 + }, + { + "epoch": 6.309105431309904, + "grad_norm": 0.11610639840364456, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7899 + }, + { + "epoch": 6.3099041533546325, + "grad_norm": 0.061168819665908813, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7900 + }, + { + "epoch": 6.310702875399361, + "grad_norm": 0.0469425804913044, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7901 + }, + { + "epoch": 6.31150159744409, + "grad_norm": 0.0483059324324131, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7902 + }, + { + "epoch": 6.312300319488818, + "grad_norm": 0.120233453810215, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7903 + }, + { + "epoch": 6.313099041533547, + "grad_norm": 0.10025710612535477, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7904 + }, + { + "epoch": 6.313897763578275, + "grad_norm": 0.08750995993614197, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7905 + }, + { + "epoch": 6.314696485623003, + "grad_norm": 0.31308433413505554, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7906 + }, + { + "epoch": 6.315495207667731, + "grad_norm": 0.06390809267759323, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7907 + }, + { + "epoch": 6.31629392971246, + "grad_norm": 0.0657041072845459, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7908 + }, + { + "epoch": 6.317092651757188, + "grad_norm": 0.09626918286085129, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7909 + }, + { + "epoch": 6.317891373801917, + "grad_norm": 0.05565343424677849, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7910 + }, + { + "epoch": 6.318690095846645, + "grad_norm": 0.06147831678390503, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7911 + }, + { + "epoch": 6.319488817891374, + "grad_norm": 0.08704033493995667, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7912 + }, + { + "epoch": 6.3202875399361025, + "grad_norm": 0.04405020549893379, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7913 + }, + { + "epoch": 6.321086261980831, + "grad_norm": 0.07587708532810211, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7914 + }, + { + "epoch": 6.321884984025559, + "grad_norm": 0.05935811623930931, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7915 + }, + { + "epoch": 6.322683706070287, + "grad_norm": 0.045584313571453094, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7916 + }, + { + "epoch": 6.323482428115016, + "grad_norm": 0.065196193754673, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7917 + }, + { + "epoch": 6.324281150159744, + "grad_norm": 0.05996553227305412, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7918 + }, + { + "epoch": 6.325079872204473, + "grad_norm": 0.04771357774734497, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7919 + }, + { + "epoch": 6.325878594249201, + "grad_norm": 0.05875687673687935, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7920 + }, + { + "epoch": 6.32667731629393, + "grad_norm": 0.15765227377414703, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7921 + }, + { + "epoch": 6.327476038338658, + "grad_norm": 0.038563717156648636, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7922 + }, + { + "epoch": 6.328274760383387, + "grad_norm": 0.04321083426475525, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7923 + }, + { + "epoch": 6.329073482428115, + "grad_norm": 0.04427725449204445, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7924 + }, + { + "epoch": 6.329872204472843, + "grad_norm": 0.06047825515270233, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7925 + }, + { + "epoch": 6.330670926517572, + "grad_norm": 0.05161035805940628, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7926 + }, + { + "epoch": 6.3314696485623, + "grad_norm": 0.06512151658535004, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7927 + }, + { + "epoch": 6.332268370607029, + "grad_norm": 0.05178358778357506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7928 + }, + { + "epoch": 6.333067092651757, + "grad_norm": 0.06199260801076889, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7929 + }, + { + "epoch": 6.333865814696486, + "grad_norm": 0.09948168694972992, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7930 + }, + { + "epoch": 6.334664536741214, + "grad_norm": 0.06568150222301483, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7931 + }, + { + "epoch": 6.335463258785943, + "grad_norm": 0.036642882972955704, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7932 + }, + { + "epoch": 6.336261980830671, + "grad_norm": 0.04814688116312027, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7933 + }, + { + "epoch": 6.3370607028754, + "grad_norm": 0.03938854858279228, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7934 + }, + { + "epoch": 6.337859424920127, + "grad_norm": 0.07778320461511612, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7935 + }, + { + "epoch": 6.338658146964856, + "grad_norm": 0.16271090507507324, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7936 + }, + { + "epoch": 6.3394568690095845, + "grad_norm": 0.3652990460395813, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7937 + }, + { + "epoch": 6.340255591054313, + "grad_norm": 0.0592365525662899, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7938 + }, + { + "epoch": 6.3410543130990416, + "grad_norm": 0.28622883558273315, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7939 + }, + { + "epoch": 6.34185303514377, + "grad_norm": 0.2270730584859848, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7940 + }, + { + "epoch": 6.342651757188499, + "grad_norm": 0.10781756043434143, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7941 + }, + { + "epoch": 6.343450479233227, + "grad_norm": 0.11611706018447876, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7942 + }, + { + "epoch": 6.344249201277956, + "grad_norm": 0.08212626725435257, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7943 + }, + { + "epoch": 6.345047923322683, + "grad_norm": 0.0739196389913559, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7944 + }, + { + "epoch": 6.345846645367412, + "grad_norm": 0.1029743030667305, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7945 + }, + { + "epoch": 6.34664536741214, + "grad_norm": 0.2787686586380005, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7946 + }, + { + "epoch": 6.347444089456869, + "grad_norm": 0.12180152535438538, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7947 + }, + { + "epoch": 6.348242811501597, + "grad_norm": 0.178681880235672, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7948 + }, + { + "epoch": 6.349041533546326, + "grad_norm": 0.10219722986221313, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7949 + }, + { + "epoch": 6.3498402555910545, + "grad_norm": 0.0773158147931099, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7950 + }, + { + "epoch": 6.350638977635783, + "grad_norm": 0.15096192061901093, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7951 + }, + { + "epoch": 6.3514376996805115, + "grad_norm": 0.06237277388572693, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7952 + }, + { + "epoch": 6.352236421725239, + "grad_norm": 1.4819257259368896, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7953 + }, + { + "epoch": 6.353035143769968, + "grad_norm": 0.09716464579105377, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7954 + }, + { + "epoch": 6.353833865814696, + "grad_norm": 0.10105668753385544, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7955 + }, + { + "epoch": 6.354632587859425, + "grad_norm": 0.09361526370048523, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7956 + }, + { + "epoch": 6.355431309904153, + "grad_norm": 0.04209212213754654, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7957 + }, + { + "epoch": 6.356230031948882, + "grad_norm": 0.11653190106153488, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7958 + }, + { + "epoch": 6.35702875399361, + "grad_norm": 0.1552112102508545, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7959 + }, + { + "epoch": 6.357827476038339, + "grad_norm": 0.07934660464525223, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7960 + }, + { + "epoch": 6.358626198083067, + "grad_norm": 0.10928693413734436, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7961 + }, + { + "epoch": 6.359424920127796, + "grad_norm": 0.15923380851745605, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7962 + }, + { + "epoch": 6.360223642172524, + "grad_norm": 0.12151104211807251, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7963 + }, + { + "epoch": 6.361022364217252, + "grad_norm": 0.055971868336200714, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7964 + }, + { + "epoch": 6.361821086261981, + "grad_norm": 0.17611366510391235, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7965 + }, + { + "epoch": 6.362619808306709, + "grad_norm": 0.16098986566066742, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7966 + }, + { + "epoch": 6.363418530351438, + "grad_norm": 1.6793769598007202, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7967 + }, + { + "epoch": 6.364217252396166, + "grad_norm": 0.4322223365306854, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 7968 + }, + { + "epoch": 6.365015974440895, + "grad_norm": 0.35510173439979553, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 7969 + }, + { + "epoch": 6.365814696485623, + "grad_norm": 0.08799898624420166, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7970 + }, + { + "epoch": 6.366613418530352, + "grad_norm": 0.28774675726890564, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7971 + }, + { + "epoch": 6.36741214057508, + "grad_norm": 0.28109011054039, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7972 + }, + { + "epoch": 6.368210862619808, + "grad_norm": 0.09055986255407333, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7973 + }, + { + "epoch": 6.3690095846645365, + "grad_norm": 0.15083353221416473, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7974 + }, + { + "epoch": 6.369808306709265, + "grad_norm": 0.20686668157577515, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7975 + }, + { + "epoch": 6.3706070287539935, + "grad_norm": 0.047575660049915314, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 7976 + }, + { + "epoch": 6.371405750798722, + "grad_norm": 0.25424477458000183, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7977 + }, + { + "epoch": 6.372204472843451, + "grad_norm": 0.21839222311973572, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7978 + }, + { + "epoch": 6.373003194888179, + "grad_norm": 0.06493431329727173, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7979 + }, + { + "epoch": 6.373801916932908, + "grad_norm": 0.2369518280029297, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 7980 + }, + { + "epoch": 6.374600638977636, + "grad_norm": 0.14641214907169342, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7981 + }, + { + "epoch": 6.375399361022364, + "grad_norm": 0.11602997034788132, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7982 + }, + { + "epoch": 6.376198083067092, + "grad_norm": 0.18792425096035004, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7983 + }, + { + "epoch": 6.376996805111821, + "grad_norm": 0.06824373453855515, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7984 + }, + { + "epoch": 6.377795527156549, + "grad_norm": 0.1228032335639, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7985 + }, + { + "epoch": 6.378594249201278, + "grad_norm": 0.15771286189556122, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7986 + }, + { + "epoch": 6.3793929712460065, + "grad_norm": 0.1157795861363411, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7987 + }, + { + "epoch": 6.380191693290735, + "grad_norm": 0.07282877713441849, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7988 + }, + { + "epoch": 6.3809904153354635, + "grad_norm": 0.10168643295764923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7989 + }, + { + "epoch": 6.381789137380192, + "grad_norm": 0.24466580152511597, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7990 + }, + { + "epoch": 6.38258785942492, + "grad_norm": 0.0972297191619873, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7991 + }, + { + "epoch": 6.383386581469648, + "grad_norm": 0.08349917083978653, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7992 + }, + { + "epoch": 6.384185303514377, + "grad_norm": 0.058114584535360336, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7993 + }, + { + "epoch": 6.384984025559105, + "grad_norm": 0.04745171591639519, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7994 + }, + { + "epoch": 6.385782747603834, + "grad_norm": 0.05484034866094589, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7995 + }, + { + "epoch": 6.386581469648562, + "grad_norm": 0.05094960704445839, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7996 + }, + { + "epoch": 6.387380191693291, + "grad_norm": 0.06368618458509445, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7997 + }, + { + "epoch": 6.388178913738019, + "grad_norm": 0.07042541354894638, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7998 + }, + { + "epoch": 6.388977635782748, + "grad_norm": 0.06182365491986275, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7999 + }, + { + "epoch": 6.389776357827476, + "grad_norm": 0.05778853967785835, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8000 + }, + { + "epoch": 6.390575079872204, + "grad_norm": 0.04334365949034691, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8001 + }, + { + "epoch": 6.391373801916933, + "grad_norm": 0.08214148133993149, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8002 + }, + { + "epoch": 6.392172523961661, + "grad_norm": 0.05468964949250221, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8003 + }, + { + "epoch": 6.39297124600639, + "grad_norm": 0.07484348863363266, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8004 + }, + { + "epoch": 6.393769968051118, + "grad_norm": 0.04987887665629387, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8005 + }, + { + "epoch": 6.394568690095847, + "grad_norm": 0.05584597587585449, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8006 + }, + { + "epoch": 6.395367412140575, + "grad_norm": 0.07088904082775116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8007 + }, + { + "epoch": 6.396166134185304, + "grad_norm": 0.26695576310157776, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8008 + }, + { + "epoch": 6.396964856230032, + "grad_norm": 0.06452658027410507, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8009 + }, + { + "epoch": 6.397763578274761, + "grad_norm": 0.08994145691394806, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8010 + }, + { + "epoch": 6.3985623003194885, + "grad_norm": 0.06565240770578384, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8011 + }, + { + "epoch": 6.399361022364217, + "grad_norm": 0.0492648184299469, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8012 + }, + { + "epoch": 6.4001597444089455, + "grad_norm": 0.06946985423564911, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8013 + }, + { + "epoch": 6.400958466453674, + "grad_norm": 0.08669331669807434, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8014 + }, + { + "epoch": 6.401757188498403, + "grad_norm": 0.07930289953947067, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8015 + }, + { + "epoch": 6.402555910543131, + "grad_norm": 0.15216746926307678, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8016 + }, + { + "epoch": 6.40335463258786, + "grad_norm": 0.051862914115190506, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8017 + }, + { + "epoch": 6.404153354632588, + "grad_norm": 0.044119443744421005, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8018 + }, + { + "epoch": 6.404952076677317, + "grad_norm": 0.09787813574075699, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8019 + }, + { + "epoch": 6.405750798722044, + "grad_norm": 0.05269203707575798, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8020 + }, + { + "epoch": 6.406549520766773, + "grad_norm": 0.06683865934610367, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8021 + }, + { + "epoch": 6.407348242811501, + "grad_norm": 0.04334628954529762, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8022 + }, + { + "epoch": 6.40814696485623, + "grad_norm": 0.037559930235147476, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8023 + }, + { + "epoch": 6.4089456869009584, + "grad_norm": 0.21066749095916748, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8024 + }, + { + "epoch": 6.409744408945687, + "grad_norm": 0.05721563845872879, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8025 + }, + { + "epoch": 6.4105431309904155, + "grad_norm": 0.047683823853731155, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8026 + }, + { + "epoch": 6.411341853035144, + "grad_norm": 0.05377231910824776, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8027 + }, + { + "epoch": 6.412140575079873, + "grad_norm": 0.05604357272386551, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8028 + }, + { + "epoch": 6.4129392971246, + "grad_norm": 0.051680225878953934, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8029 + }, + { + "epoch": 6.413738019169329, + "grad_norm": 0.04465701803565025, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8030 + }, + { + "epoch": 6.414536741214057, + "grad_norm": 0.0454387366771698, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8031 + }, + { + "epoch": 6.415335463258786, + "grad_norm": 0.5079139471054077, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8032 + }, + { + "epoch": 6.416134185303514, + "grad_norm": 0.08386353403329849, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8033 + }, + { + "epoch": 6.416932907348243, + "grad_norm": 0.06023477017879486, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8034 + }, + { + "epoch": 6.417731629392971, + "grad_norm": 0.8634743094444275, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8035 + }, + { + "epoch": 6.4185303514377, + "grad_norm": 0.06926131993532181, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8036 + }, + { + "epoch": 6.419329073482428, + "grad_norm": 0.07563464343547821, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8037 + }, + { + "epoch": 6.420127795527157, + "grad_norm": 0.10181237757205963, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8038 + }, + { + "epoch": 6.420926517571885, + "grad_norm": 0.13995511829853058, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8039 + }, + { + "epoch": 6.421725239616613, + "grad_norm": 0.05968187376856804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8040 + }, + { + "epoch": 6.422523961661342, + "grad_norm": 0.14419680833816528, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8041 + }, + { + "epoch": 6.42332268370607, + "grad_norm": 0.13762469589710236, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8042 + }, + { + "epoch": 6.424121405750799, + "grad_norm": 0.0627644956111908, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8043 + }, + { + "epoch": 6.424920127795527, + "grad_norm": 0.1356768012046814, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8044 + }, + { + "epoch": 6.425718849840256, + "grad_norm": 0.12080833315849304, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8045 + }, + { + "epoch": 6.426517571884984, + "grad_norm": 0.048654112964868546, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8046 + }, + { + "epoch": 6.427316293929713, + "grad_norm": 0.11983022093772888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8047 + }, + { + "epoch": 6.428115015974441, + "grad_norm": 0.09429550170898438, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8048 + }, + { + "epoch": 6.428913738019169, + "grad_norm": 0.07924454659223557, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8049 + }, + { + "epoch": 6.4297124600638975, + "grad_norm": 0.15244926512241364, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8050 + }, + { + "epoch": 6.430511182108626, + "grad_norm": 0.9872325658798218, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8051 + }, + { + "epoch": 6.431309904153355, + "grad_norm": 0.0790395438671112, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8052 + }, + { + "epoch": 6.432108626198083, + "grad_norm": 0.3828068673610687, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8053 + }, + { + "epoch": 6.432907348242812, + "grad_norm": 0.059630244970321655, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8054 + }, + { + "epoch": 6.43370607028754, + "grad_norm": 0.07113327085971832, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8055 + }, + { + "epoch": 6.434504792332269, + "grad_norm": 0.0496523454785347, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8056 + }, + { + "epoch": 6.435303514376997, + "grad_norm": 0.08502436429262161, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8057 + }, + { + "epoch": 6.436102236421725, + "grad_norm": 0.06082376837730408, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8058 + }, + { + "epoch": 6.436900958466453, + "grad_norm": 0.1668524146080017, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8059 + }, + { + "epoch": 6.437699680511182, + "grad_norm": 0.05411513149738312, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8060 + }, + { + "epoch": 6.43849840255591, + "grad_norm": 0.05176519230008125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8061 + }, + { + "epoch": 6.439297124600639, + "grad_norm": 0.0684237852692604, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8062 + }, + { + "epoch": 6.4400958466453675, + "grad_norm": 0.0715038925409317, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8063 + }, + { + "epoch": 6.440894568690096, + "grad_norm": 0.11311113089323044, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8064 + }, + { + "epoch": 6.4416932907348246, + "grad_norm": 0.06320979446172714, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8065 + }, + { + "epoch": 6.442492012779553, + "grad_norm": 0.09221892803907394, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8066 + }, + { + "epoch": 6.443290734824281, + "grad_norm": 0.1183326244354248, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8067 + }, + { + "epoch": 6.444089456869009, + "grad_norm": 0.08447464555501938, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8068 + }, + { + "epoch": 6.444888178913738, + "grad_norm": 0.21791045367717743, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8069 + }, + { + "epoch": 6.445686900958466, + "grad_norm": 0.055015772581100464, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8070 + }, + { + "epoch": 6.446485623003195, + "grad_norm": 0.13536514341831207, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8071 + }, + { + "epoch": 6.447284345047923, + "grad_norm": 0.16620422899723053, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8072 + }, + { + "epoch": 6.448083067092652, + "grad_norm": 0.08793147653341293, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8073 + }, + { + "epoch": 6.44888178913738, + "grad_norm": 0.0962347462773323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8074 + }, + { + "epoch": 6.449680511182109, + "grad_norm": 0.08764681965112686, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 8075 + }, + { + "epoch": 6.4504792332268375, + "grad_norm": 0.06176106259226799, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8076 + }, + { + "epoch": 6.451277955271565, + "grad_norm": 0.06823577731847763, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8077 + }, + { + "epoch": 6.452076677316294, + "grad_norm": 0.11239560693502426, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8078 + }, + { + "epoch": 6.452875399361022, + "grad_norm": 0.10309527069330215, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8079 + }, + { + "epoch": 6.453674121405751, + "grad_norm": 0.07533836364746094, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8080 + }, + { + "epoch": 6.454472843450479, + "grad_norm": 0.06650671362876892, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8081 + }, + { + "epoch": 6.455271565495208, + "grad_norm": 0.1700691431760788, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8082 + }, + { + "epoch": 6.456070287539936, + "grad_norm": 0.06135572865605354, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8083 + }, + { + "epoch": 6.456869009584665, + "grad_norm": 0.08333424478769302, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8084 + }, + { + "epoch": 6.457667731629393, + "grad_norm": 0.1338927149772644, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8085 + }, + { + "epoch": 6.458466453674122, + "grad_norm": 0.07097163796424866, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8086 + }, + { + "epoch": 6.4592651757188495, + "grad_norm": 0.06296008080244064, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8087 + }, + { + "epoch": 6.460063897763578, + "grad_norm": 0.060656916350126266, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8088 + }, + { + "epoch": 6.460862619808307, + "grad_norm": 0.044889576733112335, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8089 + }, + { + "epoch": 6.461661341853035, + "grad_norm": 0.0749807357788086, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8090 + }, + { + "epoch": 6.462460063897764, + "grad_norm": 0.07509054243564606, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8091 + }, + { + "epoch": 6.463258785942492, + "grad_norm": 0.054954417049884796, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8092 + }, + { + "epoch": 6.464057507987221, + "grad_norm": 0.05087047815322876, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8093 + }, + { + "epoch": 6.464856230031949, + "grad_norm": 0.12205887585878372, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8094 + }, + { + "epoch": 6.465654952076678, + "grad_norm": 0.08342424035072327, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8095 + }, + { + "epoch": 6.466453674121405, + "grad_norm": 0.12507228553295135, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8096 + }, + { + "epoch": 6.467252396166134, + "grad_norm": 0.10491037368774414, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8097 + }, + { + "epoch": 6.468051118210862, + "grad_norm": 0.04236119985580444, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8098 + }, + { + "epoch": 6.468849840255591, + "grad_norm": 0.10601458698511124, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8099 + }, + { + "epoch": 6.4696485623003195, + "grad_norm": 0.07485921680927277, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8100 + }, + { + "epoch": 6.470447284345048, + "grad_norm": 0.06351220607757568, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8101 + }, + { + "epoch": 6.4712460063897765, + "grad_norm": 0.08351211249828339, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8102 + }, + { + "epoch": 6.472044728434505, + "grad_norm": 0.07205908000469208, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8103 + }, + { + "epoch": 6.472843450479234, + "grad_norm": 0.07072018831968307, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8104 + }, + { + "epoch": 6.473642172523961, + "grad_norm": 0.0851733461022377, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8105 + }, + { + "epoch": 6.47444089456869, + "grad_norm": 0.07046044617891312, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8106 + }, + { + "epoch": 6.475239616613418, + "grad_norm": 0.03804340958595276, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8107 + }, + { + "epoch": 6.476038338658147, + "grad_norm": 0.059083763509988785, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8108 + }, + { + "epoch": 6.476837060702875, + "grad_norm": 0.0419149249792099, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8109 + }, + { + "epoch": 6.477635782747604, + "grad_norm": 0.07814865559339523, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8110 + }, + { + "epoch": 6.478434504792332, + "grad_norm": 0.12653781473636627, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8111 + }, + { + "epoch": 6.479233226837061, + "grad_norm": 0.10124429315328598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8112 + }, + { + "epoch": 6.4800319488817895, + "grad_norm": 0.05563808232545853, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8113 + }, + { + "epoch": 6.480830670926517, + "grad_norm": 0.07036174833774567, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8114 + }, + { + "epoch": 6.481629392971246, + "grad_norm": 0.0452839694917202, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8115 + }, + { + "epoch": 6.482428115015974, + "grad_norm": 0.13880759477615356, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8116 + }, + { + "epoch": 6.483226837060703, + "grad_norm": 0.03902722895145416, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8117 + }, + { + "epoch": 6.484025559105431, + "grad_norm": 0.08136945217847824, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8118 + }, + { + "epoch": 6.48482428115016, + "grad_norm": 0.09874774515628815, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8119 + }, + { + "epoch": 6.485623003194888, + "grad_norm": 0.06836161017417908, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8120 + }, + { + "epoch": 6.486421725239617, + "grad_norm": 0.1439940482378006, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8121 + }, + { + "epoch": 6.487220447284345, + "grad_norm": 0.0924125388264656, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8122 + }, + { + "epoch": 6.488019169329074, + "grad_norm": 0.06811019778251648, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8123 + }, + { + "epoch": 6.488817891373802, + "grad_norm": 0.1259799599647522, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8124 + }, + { + "epoch": 6.48961661341853, + "grad_norm": 0.1088009849190712, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8125 + }, + { + "epoch": 6.4904153354632586, + "grad_norm": 0.27054721117019653, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8126 + }, + { + "epoch": 6.491214057507987, + "grad_norm": 0.09674181789159775, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8127 + }, + { + "epoch": 6.492012779552716, + "grad_norm": 0.15491390228271484, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8128 + }, + { + "epoch": 6.492811501597444, + "grad_norm": 0.08790267258882523, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8129 + }, + { + "epoch": 6.493610223642173, + "grad_norm": 0.19372408092021942, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8130 + }, + { + "epoch": 6.494408945686901, + "grad_norm": 0.14786171913146973, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8131 + }, + { + "epoch": 6.49520766773163, + "grad_norm": 0.09591338783502579, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8132 + }, + { + "epoch": 6.496006389776358, + "grad_norm": 0.1810663491487503, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8133 + }, + { + "epoch": 6.496805111821086, + "grad_norm": 0.19754691421985626, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8134 + }, + { + "epoch": 6.497603833865814, + "grad_norm": 0.14094877243041992, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8135 + }, + { + "epoch": 6.498402555910543, + "grad_norm": 0.0782506987452507, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8136 + }, + { + "epoch": 6.4992012779552715, + "grad_norm": 0.19543413817882538, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8137 + }, + { + "epoch": 6.5, + "grad_norm": 0.3102439045906067, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8138 + }, + { + "epoch": 6.5007987220447285, + "grad_norm": 0.13952040672302246, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8139 + }, + { + "epoch": 6.501597444089457, + "grad_norm": 0.1902403086423874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8140 + }, + { + "epoch": 6.502396166134186, + "grad_norm": 0.2608654499053955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8141 + }, + { + "epoch": 6.503194888178914, + "grad_norm": 0.22480152547359467, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8142 + }, + { + "epoch": 6.503993610223642, + "grad_norm": 0.21580660343170166, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8143 + }, + { + "epoch": 6.50479233226837, + "grad_norm": 0.1991831213235855, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8144 + }, + { + "epoch": 6.505591054313099, + "grad_norm": 0.25885632634162903, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8145 + }, + { + "epoch": 6.506389776357827, + "grad_norm": 0.2533574104309082, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8146 + }, + { + "epoch": 6.507188498402556, + "grad_norm": 0.11494381725788116, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8147 + }, + { + "epoch": 6.507987220447284, + "grad_norm": 0.1361113339662552, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8148 + }, + { + "epoch": 6.508785942492013, + "grad_norm": 0.22099947929382324, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8149 + }, + { + "epoch": 6.5095846645367414, + "grad_norm": 0.13223077356815338, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8150 + }, + { + "epoch": 6.51038338658147, + "grad_norm": 0.18203037977218628, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8151 + }, + { + "epoch": 6.511182108626198, + "grad_norm": 0.18066702783107758, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8152 + }, + { + "epoch": 6.511980830670926, + "grad_norm": 0.09984144568443298, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8153 + }, + { + "epoch": 6.512779552715655, + "grad_norm": 0.12803718447685242, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8154 + }, + { + "epoch": 6.513578274760383, + "grad_norm": 0.19731956720352173, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8155 + }, + { + "epoch": 6.514376996805112, + "grad_norm": 0.10687378793954849, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8156 + }, + { + "epoch": 6.51517571884984, + "grad_norm": 0.0971442237496376, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8157 + }, + { + "epoch": 6.515974440894569, + "grad_norm": 0.12840867042541504, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8158 + }, + { + "epoch": 6.516773162939297, + "grad_norm": 0.1245417669415474, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8159 + }, + { + "epoch": 6.517571884984026, + "grad_norm": 0.16850991547107697, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8160 + }, + { + "epoch": 6.518370607028754, + "grad_norm": 0.1931404322385788, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8161 + }, + { + "epoch": 6.519169329073483, + "grad_norm": 0.08180713653564453, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8162 + }, + { + "epoch": 6.5199680511182105, + "grad_norm": 0.24530328810214996, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8163 + }, + { + "epoch": 6.520766773162939, + "grad_norm": 0.14107894897460938, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8164 + }, + { + "epoch": 6.521565495207668, + "grad_norm": 0.07984111458063126, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8165 + }, + { + "epoch": 6.522364217252396, + "grad_norm": 0.20894968509674072, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8166 + }, + { + "epoch": 6.523162939297125, + "grad_norm": 0.09663927555084229, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8167 + }, + { + "epoch": 6.523961661341853, + "grad_norm": 0.0913434773683548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8168 + }, + { + "epoch": 6.524760383386582, + "grad_norm": 0.1247463971376419, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8169 + }, + { + "epoch": 6.52555910543131, + "grad_norm": 0.06504802405834198, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8170 + }, + { + "epoch": 6.526357827476039, + "grad_norm": 0.10900555551052094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8171 + }, + { + "epoch": 6.527156549520766, + "grad_norm": 0.047379642724990845, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8172 + }, + { + "epoch": 6.527955271565495, + "grad_norm": 0.17822134494781494, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8173 + }, + { + "epoch": 6.5287539936102235, + "grad_norm": 0.07658754289150238, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8174 + }, + { + "epoch": 6.529552715654952, + "grad_norm": 0.17294292151927948, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8175 + }, + { + "epoch": 6.5303514376996805, + "grad_norm": 0.07095851004123688, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8176 + }, + { + "epoch": 6.531150159744409, + "grad_norm": 0.07328472286462784, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8177 + }, + { + "epoch": 6.531948881789138, + "grad_norm": 0.11216691881418228, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8178 + }, + { + "epoch": 6.532747603833866, + "grad_norm": 0.3007374703884125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8179 + }, + { + "epoch": 6.533546325878595, + "grad_norm": 0.06059226021170616, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8180 + }, + { + "epoch": 6.534345047923322, + "grad_norm": 0.14438967406749725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8181 + }, + { + "epoch": 6.535143769968051, + "grad_norm": 0.1965394914150238, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8182 + }, + { + "epoch": 6.535942492012779, + "grad_norm": 0.130478173494339, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8183 + }, + { + "epoch": 6.536741214057508, + "grad_norm": 0.16713190078735352, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8184 + }, + { + "epoch": 6.537539936102236, + "grad_norm": 0.18644076585769653, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8185 + }, + { + "epoch": 6.538338658146965, + "grad_norm": 0.06685839593410492, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8186 + }, + { + "epoch": 6.539137380191693, + "grad_norm": 0.17819803953170776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8187 + }, + { + "epoch": 6.539936102236422, + "grad_norm": 0.5894746780395508, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8188 + }, + { + "epoch": 6.5407348242811505, + "grad_norm": 0.088719442486763, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8189 + }, + { + "epoch": 6.541533546325878, + "grad_norm": 0.1336045265197754, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8190 + }, + { + "epoch": 6.542332268370607, + "grad_norm": 0.12859520316123962, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8191 + }, + { + "epoch": 6.543130990415335, + "grad_norm": 0.13402487337589264, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8192 + }, + { + "epoch": 6.543929712460064, + "grad_norm": 0.11415290832519531, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8193 + }, + { + "epoch": 6.544728434504792, + "grad_norm": 0.1775715947151184, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8194 + }, + { + "epoch": 6.545527156549521, + "grad_norm": 0.6331294775009155, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8195 + }, + { + "epoch": 6.546325878594249, + "grad_norm": 0.09323445707559586, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8196 + }, + { + "epoch": 6.547124600638978, + "grad_norm": 0.1761421412229538, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8197 + }, + { + "epoch": 6.547923322683706, + "grad_norm": 0.09608824551105499, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8198 + }, + { + "epoch": 6.548722044728435, + "grad_norm": 0.07564207166433334, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8199 + }, + { + "epoch": 6.549520766773163, + "grad_norm": 0.08033318817615509, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8200 + }, + { + "epoch": 6.550319488817891, + "grad_norm": 0.13604776561260223, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8201 + }, + { + "epoch": 6.55111821086262, + "grad_norm": 0.1046299859881401, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8202 + }, + { + "epoch": 6.551916932907348, + "grad_norm": 0.23783712089061737, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8203 + }, + { + "epoch": 6.552715654952077, + "grad_norm": 0.07360750436782837, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8204 + }, + { + "epoch": 6.553514376996805, + "grad_norm": 0.07213526219129562, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8205 + }, + { + "epoch": 6.554313099041534, + "grad_norm": 0.12431066483259201, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8206 + }, + { + "epoch": 6.555111821086262, + "grad_norm": 0.09665104001760483, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8207 + }, + { + "epoch": 6.555910543130991, + "grad_norm": 0.22090987861156464, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8208 + }, + { + "epoch": 6.556709265175719, + "grad_norm": 0.14936690032482147, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8209 + }, + { + "epoch": 6.557507987220447, + "grad_norm": 0.09804648160934448, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8210 + }, + { + "epoch": 6.5583067092651754, + "grad_norm": 0.07829400897026062, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8211 + }, + { + "epoch": 6.559105431309904, + "grad_norm": 0.08218041807413101, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8212 + }, + { + "epoch": 6.5599041533546325, + "grad_norm": 0.08018422871828079, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8213 + }, + { + "epoch": 6.560702875399361, + "grad_norm": 0.07790627330541611, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8214 + }, + { + "epoch": 6.56150159744409, + "grad_norm": 0.12526501715183258, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8215 + }, + { + "epoch": 6.562300319488818, + "grad_norm": 0.15222279727458954, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8216 + }, + { + "epoch": 6.563099041533547, + "grad_norm": 0.19605369865894318, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8217 + }, + { + "epoch": 6.563897763578275, + "grad_norm": 1.4426831007003784, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8218 + }, + { + "epoch": 6.564696485623003, + "grad_norm": 0.184299498796463, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8219 + }, + { + "epoch": 6.565495207667731, + "grad_norm": 0.12029392272233963, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8220 + }, + { + "epoch": 6.56629392971246, + "grad_norm": 0.07442726939916611, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8221 + }, + { + "epoch": 6.567092651757188, + "grad_norm": 0.14331156015396118, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8222 + }, + { + "epoch": 6.567891373801917, + "grad_norm": 0.11202000081539154, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 8223 + }, + { + "epoch": 6.568690095846645, + "grad_norm": 0.10699515789747238, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8224 + }, + { + "epoch": 6.569488817891374, + "grad_norm": 0.07708705961704254, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8225 + }, + { + "epoch": 6.5702875399361025, + "grad_norm": 0.08026644587516785, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8226 + }, + { + "epoch": 6.571086261980831, + "grad_norm": 0.08694002777338028, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8227 + }, + { + "epoch": 6.571884984025559, + "grad_norm": 0.11824248731136322, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8228 + }, + { + "epoch": 6.572683706070287, + "grad_norm": 0.06505008041858673, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8229 + }, + { + "epoch": 6.573482428115016, + "grad_norm": 0.05341152846813202, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8230 + }, + { + "epoch": 6.574281150159744, + "grad_norm": 0.09604120999574661, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8231 + }, + { + "epoch": 6.575079872204473, + "grad_norm": 0.08336330950260162, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8232 + }, + { + "epoch": 6.575878594249201, + "grad_norm": 0.06368359923362732, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8233 + }, + { + "epoch": 6.57667731629393, + "grad_norm": 0.13115698099136353, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8234 + }, + { + "epoch": 6.577476038338658, + "grad_norm": 0.08847527951002121, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8235 + }, + { + "epoch": 6.578274760383387, + "grad_norm": 0.0458359532058239, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8236 + }, + { + "epoch": 6.5790734824281145, + "grad_norm": 0.10106709599494934, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8237 + }, + { + "epoch": 6.579872204472844, + "grad_norm": 0.06641486287117004, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8238 + }, + { + "epoch": 6.580670926517572, + "grad_norm": 0.0733480304479599, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8239 + }, + { + "epoch": 6.5814696485623, + "grad_norm": 0.07835566252470016, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8240 + }, + { + "epoch": 6.582268370607029, + "grad_norm": 0.13473013043403625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8241 + }, + { + "epoch": 6.583067092651757, + "grad_norm": 0.062259674072265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8242 + }, + { + "epoch": 6.583865814696486, + "grad_norm": 0.05236242339015007, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8243 + }, + { + "epoch": 6.584664536741214, + "grad_norm": 0.08255355805158615, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8244 + }, + { + "epoch": 6.585463258785943, + "grad_norm": 0.1182556301355362, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8245 + }, + { + "epoch": 6.586261980830671, + "grad_norm": 0.0555981881916523, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8246 + }, + { + "epoch": 6.5870607028754, + "grad_norm": 0.09490877389907837, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8247 + }, + { + "epoch": 6.587859424920127, + "grad_norm": 0.6106880903244019, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8248 + }, + { + "epoch": 6.588658146964856, + "grad_norm": 0.0474761538207531, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8249 + }, + { + "epoch": 6.5894568690095845, + "grad_norm": 0.1429997831583023, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8250 + }, + { + "epoch": 6.590255591054313, + "grad_norm": 0.0815487951040268, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8251 + }, + { + "epoch": 6.5910543130990416, + "grad_norm": 0.096903957426548, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8252 + }, + { + "epoch": 6.59185303514377, + "grad_norm": 0.17775478959083557, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8253 + }, + { + "epoch": 6.592651757188499, + "grad_norm": 0.11637275665998459, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8254 + }, + { + "epoch": 6.593450479233227, + "grad_norm": 0.08475788682699203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8255 + }, + { + "epoch": 6.594249201277956, + "grad_norm": 0.1786298304796219, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8256 + }, + { + "epoch": 6.595047923322683, + "grad_norm": 0.12316745519638062, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8257 + }, + { + "epoch": 6.595846645367412, + "grad_norm": 0.5367861986160278, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8258 + }, + { + "epoch": 6.59664536741214, + "grad_norm": 0.2289825677871704, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8259 + }, + { + "epoch": 6.597444089456869, + "grad_norm": 0.17333106696605682, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8260 + }, + { + "epoch": 6.598242811501597, + "grad_norm": 0.10858172923326492, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8261 + }, + { + "epoch": 6.599041533546326, + "grad_norm": 0.2013384997844696, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8262 + }, + { + "epoch": 6.5998402555910545, + "grad_norm": 0.13658639788627625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8263 + }, + { + "epoch": 6.600638977635783, + "grad_norm": 0.12755805253982544, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8264 + }, + { + "epoch": 6.6014376996805115, + "grad_norm": 0.18299050629138947, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 6.602236421725239, + "grad_norm": 0.07105828821659088, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8266 + }, + { + "epoch": 6.603035143769968, + "grad_norm": 0.13049830496311188, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8267 + }, + { + "epoch": 6.603833865814696, + "grad_norm": 0.16121532022953033, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8268 + }, + { + "epoch": 6.604632587859425, + "grad_norm": 0.07512015104293823, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8269 + }, + { + "epoch": 6.605431309904153, + "grad_norm": 0.17407254874706268, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8270 + }, + { + "epoch": 6.606230031948882, + "grad_norm": 0.11297854781150818, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8271 + }, + { + "epoch": 6.60702875399361, + "grad_norm": 0.2839175760746002, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8272 + }, + { + "epoch": 6.607827476038339, + "grad_norm": 0.07847599685192108, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8273 + }, + { + "epoch": 6.608626198083067, + "grad_norm": 0.08995212614536285, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8274 + }, + { + "epoch": 6.609424920127795, + "grad_norm": 0.07382770627737045, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8275 + }, + { + "epoch": 6.6102236421725244, + "grad_norm": 0.06170637533068657, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8276 + }, + { + "epoch": 6.611022364217252, + "grad_norm": 0.07311394810676575, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8277 + }, + { + "epoch": 6.611821086261981, + "grad_norm": 0.06827707588672638, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8278 + }, + { + "epoch": 6.612619808306709, + "grad_norm": 0.05261022970080376, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8279 + }, + { + "epoch": 6.613418530351438, + "grad_norm": 0.11326271295547485, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8280 + }, + { + "epoch": 6.614217252396166, + "grad_norm": 0.1652819961309433, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8281 + }, + { + "epoch": 6.615015974440895, + "grad_norm": 0.10749676078557968, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8282 + }, + { + "epoch": 6.615814696485623, + "grad_norm": 0.20359984040260315, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8283 + }, + { + "epoch": 6.616613418530352, + "grad_norm": 0.18771138787269592, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8284 + }, + { + "epoch": 6.61741214057508, + "grad_norm": 2.5382773876190186, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8285 + }, + { + "epoch": 6.618210862619808, + "grad_norm": 0.30566683411598206, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8286 + }, + { + "epoch": 6.6190095846645365, + "grad_norm": 0.3638366758823395, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8287 + }, + { + "epoch": 6.619808306709265, + "grad_norm": 0.10939022153615952, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8288 + }, + { + "epoch": 6.6206070287539935, + "grad_norm": 0.3243744969367981, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8289 + }, + { + "epoch": 6.621405750798722, + "grad_norm": 0.2703976333141327, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8290 + }, + { + "epoch": 6.622204472843451, + "grad_norm": 0.06998306512832642, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8291 + }, + { + "epoch": 6.623003194888179, + "grad_norm": 0.25409170985221863, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8292 + }, + { + "epoch": 6.623801916932908, + "grad_norm": 0.110246442258358, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8293 + }, + { + "epoch": 6.624600638977636, + "grad_norm": 0.1667647659778595, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8294 + }, + { + "epoch": 6.625399361022364, + "grad_norm": 0.17452718317508698, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8295 + }, + { + "epoch": 6.626198083067092, + "grad_norm": 0.11691702157258987, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8296 + }, + { + "epoch": 6.626996805111821, + "grad_norm": 0.14679500460624695, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8297 + }, + { + "epoch": 6.627795527156549, + "grad_norm": 0.06978808343410492, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8298 + }, + { + "epoch": 6.628594249201278, + "grad_norm": 0.36758533120155334, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8299 + }, + { + "epoch": 6.6293929712460065, + "grad_norm": 0.11101481318473816, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8300 + }, + { + "epoch": 6.630191693290735, + "grad_norm": 0.11762239784002304, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8301 + }, + { + "epoch": 6.6309904153354635, + "grad_norm": 0.11467000097036362, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8302 + }, + { + "epoch": 6.631789137380192, + "grad_norm": 0.14236292243003845, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8303 + }, + { + "epoch": 6.63258785942492, + "grad_norm": 0.050860557705163956, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8304 + }, + { + "epoch": 6.633386581469648, + "grad_norm": 0.07763084024190903, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 8305 + }, + { + "epoch": 6.634185303514377, + "grad_norm": 0.06728993356227875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8306 + }, + { + "epoch": 6.634984025559105, + "grad_norm": 0.06984454393386841, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8307 + }, + { + "epoch": 6.635782747603834, + "grad_norm": 0.09839699417352676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8308 + }, + { + "epoch": 6.636581469648562, + "grad_norm": 0.1262810379266739, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8309 + }, + { + "epoch": 6.637380191693291, + "grad_norm": 0.08147390931844711, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8310 + }, + { + "epoch": 6.638178913738019, + "grad_norm": 0.11567803472280502, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8311 + }, + { + "epoch": 6.638977635782748, + "grad_norm": 0.14972445368766785, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8312 + }, + { + "epoch": 6.6397763578274756, + "grad_norm": 0.2970331609249115, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8313 + }, + { + "epoch": 6.640575079872205, + "grad_norm": 0.05576174706220627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8314 + }, + { + "epoch": 6.641373801916933, + "grad_norm": 0.048716023564338684, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8315 + }, + { + "epoch": 6.642172523961661, + "grad_norm": 0.05986058712005615, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8316 + }, + { + "epoch": 6.64297124600639, + "grad_norm": 0.07985493540763855, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8317 + }, + { + "epoch": 6.643769968051118, + "grad_norm": 0.5361261963844299, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 8318 + }, + { + "epoch": 6.644568690095847, + "grad_norm": 0.15383858978748322, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8319 + }, + { + "epoch": 6.645367412140575, + "grad_norm": 0.17428068816661835, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8320 + }, + { + "epoch": 6.646166134185304, + "grad_norm": 0.09801791608333588, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8321 + }, + { + "epoch": 6.646964856230032, + "grad_norm": 0.11805883049964905, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8322 + }, + { + "epoch": 6.647763578274761, + "grad_norm": 0.13135986030101776, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8323 + }, + { + "epoch": 6.6485623003194885, + "grad_norm": 0.10351908206939697, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8324 + }, + { + "epoch": 6.649361022364217, + "grad_norm": 0.11086217314004898, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8325 + }, + { + "epoch": 6.6501597444089455, + "grad_norm": 0.1173853799700737, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8326 + }, + { + "epoch": 6.650958466453674, + "grad_norm": 0.10743618756532669, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8327 + }, + { + "epoch": 6.651757188498403, + "grad_norm": 0.5378667116165161, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8328 + }, + { + "epoch": 6.652555910543131, + "grad_norm": 0.5077546834945679, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8329 + }, + { + "epoch": 6.65335463258786, + "grad_norm": 0.21998530626296997, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8330 + }, + { + "epoch": 6.654153354632588, + "grad_norm": 0.1235295757651329, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8331 + }, + { + "epoch": 6.654952076677317, + "grad_norm": 0.7328196167945862, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 8332 + }, + { + "epoch": 6.655750798722044, + "grad_norm": 0.12249958515167236, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8333 + }, + { + "epoch": 6.656549520766773, + "grad_norm": 0.12837325036525726, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8334 + }, + { + "epoch": 6.657348242811501, + "grad_norm": 0.09456688165664673, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8335 + }, + { + "epoch": 6.65814696485623, + "grad_norm": 0.13044698536396027, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8336 + }, + { + "epoch": 6.6589456869009584, + "grad_norm": 0.13105876743793488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8337 + }, + { + "epoch": 6.659744408945687, + "grad_norm": 0.14498500525951385, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8338 + }, + { + "epoch": 6.6605431309904155, + "grad_norm": 0.08840721845626831, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8339 + }, + { + "epoch": 6.661341853035144, + "grad_norm": 1.276719570159912, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8340 + }, + { + "epoch": 6.662140575079873, + "grad_norm": 0.36189836263656616, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 8341 + }, + { + "epoch": 6.6629392971246, + "grad_norm": 0.6304068565368652, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 8342 + }, + { + "epoch": 6.663738019169329, + "grad_norm": 0.524870753288269, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 8343 + }, + { + "epoch": 6.664536741214057, + "grad_norm": 0.14638005197048187, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8344 + }, + { + "epoch": 6.665335463258786, + "grad_norm": 0.3090416491031647, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 8345 + }, + { + "epoch": 6.666134185303514, + "grad_norm": 0.1549086570739746, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8346 + }, + { + "epoch": 6.666932907348243, + "grad_norm": 0.36996960639953613, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8347 + }, + { + "epoch": 6.667731629392971, + "grad_norm": 0.4879205524921417, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 8348 + }, + { + "epoch": 6.6685303514377, + "grad_norm": 0.6129382848739624, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8349 + }, + { + "epoch": 6.669329073482428, + "grad_norm": 0.37913191318511963, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8350 + }, + { + "epoch": 6.670127795527156, + "grad_norm": 0.1678311973810196, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 8351 + }, + { + "epoch": 6.6709265175718855, + "grad_norm": 0.17131182551383972, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8352 + }, + { + "epoch": 6.671725239616613, + "grad_norm": 0.29875028133392334, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8353 + }, + { + "epoch": 6.672523961661342, + "grad_norm": 0.5288842916488647, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8354 + }, + { + "epoch": 6.67332268370607, + "grad_norm": 0.24637238681316376, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8355 + }, + { + "epoch": 6.674121405750799, + "grad_norm": 0.25089535117149353, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8356 + }, + { + "epoch": 6.674920127795527, + "grad_norm": 0.5517246723175049, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8357 + }, + { + "epoch": 6.675718849840256, + "grad_norm": 0.07291965931653976, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8358 + }, + { + "epoch": 6.676517571884984, + "grad_norm": 0.2561021149158478, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8359 + }, + { + "epoch": 6.677316293929713, + "grad_norm": 0.2184453308582306, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8360 + }, + { + "epoch": 6.678115015974441, + "grad_norm": 0.10715393722057343, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8361 + }, + { + "epoch": 6.678913738019169, + "grad_norm": 0.16824330389499664, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8362 + }, + { + "epoch": 6.6797124600638975, + "grad_norm": 0.22539092600345612, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8363 + }, + { + "epoch": 6.680511182108626, + "grad_norm": 0.11956257373094559, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8364 + }, + { + "epoch": 6.681309904153355, + "grad_norm": 0.2023434042930603, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8365 + }, + { + "epoch": 6.682108626198083, + "grad_norm": 0.26878416538238525, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8366 + }, + { + "epoch": 6.682907348242812, + "grad_norm": 0.11318770796060562, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8367 + }, + { + "epoch": 6.68370607028754, + "grad_norm": 0.29282090067863464, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8368 + }, + { + "epoch": 6.684504792332269, + "grad_norm": 0.23825445771217346, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8369 + }, + { + "epoch": 6.685303514376997, + "grad_norm": 0.27186012268066406, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 8370 + }, + { + "epoch": 6.686102236421725, + "grad_norm": 0.28540825843811035, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8371 + }, + { + "epoch": 6.686900958466453, + "grad_norm": 0.14273707568645477, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8372 + }, + { + "epoch": 6.687699680511182, + "grad_norm": 0.3684747815132141, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8373 + }, + { + "epoch": 6.68849840255591, + "grad_norm": 0.23812046647071838, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8374 + }, + { + "epoch": 6.689297124600639, + "grad_norm": 0.15459395945072174, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8375 + }, + { + "epoch": 6.6900958466453675, + "grad_norm": 0.28762584924697876, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8376 + }, + { + "epoch": 6.690894568690096, + "grad_norm": 0.16686615347862244, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8377 + }, + { + "epoch": 6.6916932907348246, + "grad_norm": 0.16456246376037598, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8378 + }, + { + "epoch": 6.692492012779553, + "grad_norm": 0.2991560399532318, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8379 + }, + { + "epoch": 6.693290734824281, + "grad_norm": 0.14811092615127563, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8380 + }, + { + "epoch": 6.694089456869009, + "grad_norm": 0.14380809664726257, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8381 + }, + { + "epoch": 6.694888178913738, + "grad_norm": 0.0801207646727562, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8382 + }, + { + "epoch": 6.695686900958466, + "grad_norm": 0.08404620736837387, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8383 + }, + { + "epoch": 6.696485623003195, + "grad_norm": 0.1137305274605751, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8384 + }, + { + "epoch": 6.697284345047923, + "grad_norm": 0.08207721263170242, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8385 + }, + { + "epoch": 6.698083067092652, + "grad_norm": 0.09234748780727386, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8386 + }, + { + "epoch": 6.69888178913738, + "grad_norm": 0.29589149355888367, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8387 + }, + { + "epoch": 6.699680511182109, + "grad_norm": 0.2142077386379242, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8388 + }, + { + "epoch": 6.700479233226837, + "grad_norm": 0.10343299061059952, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8389 + }, + { + "epoch": 6.701277955271565, + "grad_norm": 0.12988241016864777, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8390 + }, + { + "epoch": 6.702076677316294, + "grad_norm": 0.20497195422649384, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 8391 + }, + { + "epoch": 6.702875399361022, + "grad_norm": 0.10697030276060104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8392 + }, + { + "epoch": 6.703674121405751, + "grad_norm": 0.1844921112060547, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8393 + }, + { + "epoch": 6.704472843450479, + "grad_norm": 0.13283176720142365, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8394 + }, + { + "epoch": 6.705271565495208, + "grad_norm": 0.14544987678527832, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8395 + }, + { + "epoch": 6.706070287539936, + "grad_norm": 0.10253588855266571, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8396 + }, + { + "epoch": 6.706869009584665, + "grad_norm": 0.11183217167854309, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8397 + }, + { + "epoch": 6.707667731629393, + "grad_norm": 0.12705212831497192, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8398 + }, + { + "epoch": 6.708466453674122, + "grad_norm": 0.08835884928703308, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8399 + }, + { + "epoch": 6.7092651757188495, + "grad_norm": 0.22377537190914154, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8400 + }, + { + "epoch": 6.710063897763578, + "grad_norm": 0.7205986976623535, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8401 + }, + { + "epoch": 6.710862619808307, + "grad_norm": 0.07383892685174942, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8402 + }, + { + "epoch": 6.711661341853035, + "grad_norm": 0.11109078675508499, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8403 + }, + { + "epoch": 6.712460063897764, + "grad_norm": 0.10979527235031128, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8404 + }, + { + "epoch": 6.713258785942492, + "grad_norm": 0.062491416931152344, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8405 + }, + { + "epoch": 6.714057507987221, + "grad_norm": 0.11196211725473404, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8406 + }, + { + "epoch": 6.714856230031949, + "grad_norm": 0.07815852016210556, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8407 + }, + { + "epoch": 6.715654952076678, + "grad_norm": 3.9684712886810303, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8408 + }, + { + "epoch": 6.716453674121405, + "grad_norm": 0.11982189118862152, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8409 + }, + { + "epoch": 6.717252396166134, + "grad_norm": 0.22319400310516357, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8410 + }, + { + "epoch": 6.718051118210862, + "grad_norm": 0.0937948003411293, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8411 + }, + { + "epoch": 6.718849840255591, + "grad_norm": 0.09193865954875946, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8412 + }, + { + "epoch": 6.7196485623003195, + "grad_norm": 0.08838166296482086, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8413 + }, + { + "epoch": 6.720447284345048, + "grad_norm": 0.0960271805524826, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8414 + }, + { + "epoch": 6.7212460063897765, + "grad_norm": 0.07488188147544861, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8415 + }, + { + "epoch": 6.722044728434505, + "grad_norm": 0.08563253283500671, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8416 + }, + { + "epoch": 6.722843450479234, + "grad_norm": 0.16766750812530518, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8417 + }, + { + "epoch": 6.723642172523961, + "grad_norm": 0.12811559438705444, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8418 + }, + { + "epoch": 6.72444089456869, + "grad_norm": 0.12410838901996613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8419 + }, + { + "epoch": 6.725239616613418, + "grad_norm": 0.1354755014181137, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8420 + }, + { + "epoch": 6.726038338658147, + "grad_norm": 0.17771920561790466, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8421 + }, + { + "epoch": 6.726837060702875, + "grad_norm": 0.19576571881771088, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8422 + }, + { + "epoch": 6.727635782747604, + "grad_norm": 0.5415527820587158, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 8423 + }, + { + "epoch": 6.728434504792332, + "grad_norm": 0.6647717952728271, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8424 + }, + { + "epoch": 6.729233226837061, + "grad_norm": 0.16329380869865417, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8425 + }, + { + "epoch": 6.7300319488817895, + "grad_norm": 0.4046335518360138, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8426 + }, + { + "epoch": 6.730830670926517, + "grad_norm": 0.1817079335451126, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8427 + }, + { + "epoch": 6.731629392971246, + "grad_norm": 0.3438379466533661, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 8428 + }, + { + "epoch": 6.732428115015974, + "grad_norm": 0.48276495933532715, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8429 + }, + { + "epoch": 6.733226837060703, + "grad_norm": 0.4002913236618042, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8430 + }, + { + "epoch": 6.734025559105431, + "grad_norm": 0.37833303213119507, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8431 + }, + { + "epoch": 6.73482428115016, + "grad_norm": 0.26374873518943787, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8432 + }, + { + "epoch": 6.735623003194888, + "grad_norm": 0.19766554236412048, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8433 + }, + { + "epoch": 6.736421725239617, + "grad_norm": 0.1996731013059616, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8434 + }, + { + "epoch": 6.737220447284345, + "grad_norm": 0.19733403623104095, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8435 + }, + { + "epoch": 6.738019169329074, + "grad_norm": 0.24423246085643768, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8436 + }, + { + "epoch": 6.738817891373802, + "grad_norm": 0.4329655170440674, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8437 + }, + { + "epoch": 6.73961661341853, + "grad_norm": 0.6964716911315918, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8438 + }, + { + "epoch": 6.7404153354632586, + "grad_norm": 0.12961135804653168, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8439 + }, + { + "epoch": 6.741214057507987, + "grad_norm": 0.2783071994781494, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8440 + }, + { + "epoch": 6.742012779552716, + "grad_norm": 0.3446369767189026, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8441 + }, + { + "epoch": 6.742811501597444, + "grad_norm": 0.22592051327228546, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8442 + }, + { + "epoch": 6.743610223642173, + "grad_norm": 0.06710102409124374, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8443 + }, + { + "epoch": 6.744408945686901, + "grad_norm": 0.2268608957529068, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8444 + }, + { + "epoch": 6.74520766773163, + "grad_norm": 0.08200005441904068, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8445 + }, + { + "epoch": 6.746006389776358, + "grad_norm": 0.2357168197631836, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8446 + }, + { + "epoch": 6.746805111821086, + "grad_norm": 0.20047837495803833, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8447 + }, + { + "epoch": 6.747603833865814, + "grad_norm": 0.2309340387582779, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8448 + }, + { + "epoch": 6.748402555910543, + "grad_norm": 0.11635745316743851, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8449 + }, + { + "epoch": 6.7492012779552715, + "grad_norm": 0.4076550602912903, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8450 + }, + { + "epoch": 6.75, + "grad_norm": 0.3500226140022278, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8451 + }, + { + "epoch": 6.7507987220447285, + "grad_norm": 0.2993873357772827, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8452 + }, + { + "epoch": 6.751597444089457, + "grad_norm": 0.1099642813205719, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8453 + }, + { + "epoch": 6.752396166134186, + "grad_norm": 0.17455045878887177, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8454 + }, + { + "epoch": 6.753194888178914, + "grad_norm": 0.12831585109233856, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8455 + }, + { + "epoch": 6.753993610223642, + "grad_norm": 0.1048964336514473, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8456 + }, + { + "epoch": 6.75479233226837, + "grad_norm": 0.16713464260101318, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8457 + }, + { + "epoch": 6.755591054313099, + "grad_norm": 0.07837880402803421, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8458 + }, + { + "epoch": 6.756389776357827, + "grad_norm": 0.17375724017620087, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8459 + }, + { + "epoch": 6.757188498402556, + "grad_norm": 0.9700595140457153, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8460 + }, + { + "epoch": 6.757987220447284, + "grad_norm": 0.23614056408405304, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8461 + }, + { + "epoch": 6.758785942492013, + "grad_norm": 0.2536165416240692, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 8462 + }, + { + "epoch": 6.7595846645367414, + "grad_norm": 0.26688873767852783, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 8463 + }, + { + "epoch": 6.76038338658147, + "grad_norm": 0.3807159662246704, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 8464 + }, + { + "epoch": 6.761182108626198, + "grad_norm": 0.2132156789302826, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8465 + }, + { + "epoch": 6.761980830670926, + "grad_norm": 0.19821512699127197, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 8466 + }, + { + "epoch": 6.762779552715655, + "grad_norm": 0.23694948852062225, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 8467 + }, + { + "epoch": 6.763578274760383, + "grad_norm": 0.1524704396724701, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 8468 + }, + { + "epoch": 6.764376996805112, + "grad_norm": 0.26719930768013, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 8469 + }, + { + "epoch": 6.76517571884984, + "grad_norm": 0.12077363580465317, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8470 + }, + { + "epoch": 6.765974440894569, + "grad_norm": 0.14398355782032013, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8471 + }, + { + "epoch": 6.766773162939297, + "grad_norm": 0.1972649097442627, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8472 + }, + { + "epoch": 6.767571884984026, + "grad_norm": 0.10172676295042038, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8473 + }, + { + "epoch": 6.768370607028754, + "grad_norm": 0.10743385553359985, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8474 + }, + { + "epoch": 6.769169329073483, + "grad_norm": 0.06148320063948631, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8475 + }, + { + "epoch": 6.7699680511182105, + "grad_norm": 0.08771604299545288, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8476 + }, + { + "epoch": 6.770766773162939, + "grad_norm": 0.13444122672080994, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8477 + }, + { + "epoch": 6.771565495207668, + "grad_norm": 0.4677158296108246, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8478 + }, + { + "epoch": 6.772364217252396, + "grad_norm": 0.08972432464361191, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8479 + }, + { + "epoch": 6.773162939297125, + "grad_norm": 0.10502214729785919, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8480 + }, + { + "epoch": 6.773961661341853, + "grad_norm": 0.14014923572540283, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8481 + }, + { + "epoch": 6.774760383386582, + "grad_norm": 0.3244888484477997, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8482 + }, + { + "epoch": 6.77555910543131, + "grad_norm": 0.20495742559432983, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8483 + }, + { + "epoch": 6.776357827476039, + "grad_norm": 0.15609663724899292, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8484 + }, + { + "epoch": 6.777156549520766, + "grad_norm": 0.13948239386081696, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8485 + }, + { + "epoch": 6.777955271565495, + "grad_norm": 0.28558677434921265, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8486 + }, + { + "epoch": 6.7787539936102235, + "grad_norm": 0.1481117457151413, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8487 + }, + { + "epoch": 6.779552715654952, + "grad_norm": 0.31998512148857117, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8488 + }, + { + "epoch": 6.7803514376996805, + "grad_norm": 0.1945921927690506, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8489 + }, + { + "epoch": 6.781150159744409, + "grad_norm": 18.217361450195312, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8490 + }, + { + "epoch": 6.781948881789138, + "grad_norm": 0.23472756147384644, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 8491 + }, + { + "epoch": 6.782747603833866, + "grad_norm": 0.10026291757822037, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8492 + }, + { + "epoch": 6.783546325878595, + "grad_norm": 0.14418581128120422, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8493 + }, + { + "epoch": 6.784345047923322, + "grad_norm": 0.14439892768859863, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8494 + }, + { + "epoch": 6.785143769968051, + "grad_norm": 0.37140071392059326, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8495 + }, + { + "epoch": 6.785942492012779, + "grad_norm": 0.09995266050100327, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8496 + }, + { + "epoch": 6.786741214057508, + "grad_norm": 0.08430355042219162, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8497 + }, + { + "epoch": 6.787539936102236, + "grad_norm": 0.11121980845928192, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8498 + }, + { + "epoch": 6.788338658146965, + "grad_norm": 0.20520392060279846, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8499 + }, + { + "epoch": 6.789137380191693, + "grad_norm": 0.10163573920726776, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8500 + }, + { + "epoch": 6.789936102236422, + "grad_norm": 0.12025435268878937, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8501 + }, + { + "epoch": 6.7907348242811505, + "grad_norm": 0.12003593891859055, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8502 + }, + { + "epoch": 6.791533546325878, + "grad_norm": 0.11013154685497284, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8503 + }, + { + "epoch": 6.792332268370607, + "grad_norm": 0.10089465230703354, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8504 + }, + { + "epoch": 6.793130990415335, + "grad_norm": 0.06270314007997513, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8505 + }, + { + "epoch": 6.793929712460064, + "grad_norm": 0.08571597188711166, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8506 + }, + { + "epoch": 6.794728434504792, + "grad_norm": 0.5324975848197937, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8507 + }, + { + "epoch": 6.795527156549521, + "grad_norm": 0.24500170350074768, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8508 + }, + { + "epoch": 6.796325878594249, + "grad_norm": 0.10234003514051437, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8509 + }, + { + "epoch": 6.797124600638978, + "grad_norm": 0.09924131631851196, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8510 + }, + { + "epoch": 6.797923322683706, + "grad_norm": 0.1413181573152542, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8511 + }, + { + "epoch": 6.798722044728435, + "grad_norm": 0.12095441669225693, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8512 + }, + { + "epoch": 6.799520766773163, + "grad_norm": 0.08617071062326431, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8513 + }, + { + "epoch": 6.800319488817891, + "grad_norm": 0.17984576523303986, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 8514 + }, + { + "epoch": 6.80111821086262, + "grad_norm": 0.16447608172893524, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8515 + }, + { + "epoch": 6.801916932907348, + "grad_norm": 0.15486668050289154, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8516 + }, + { + "epoch": 6.802715654952077, + "grad_norm": 0.10176295787096024, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8517 + }, + { + "epoch": 6.803514376996805, + "grad_norm": 0.14911721646785736, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8518 + }, + { + "epoch": 6.804313099041534, + "grad_norm": 0.11073625087738037, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8519 + }, + { + "epoch": 6.805111821086262, + "grad_norm": 0.10299605876207352, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8520 + }, + { + "epoch": 6.805910543130991, + "grad_norm": 0.189669668674469, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8521 + }, + { + "epoch": 6.806709265175719, + "grad_norm": 0.12226799875497818, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8522 + }, + { + "epoch": 6.807507987220447, + "grad_norm": 0.17778469622135162, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8523 + }, + { + "epoch": 6.8083067092651754, + "grad_norm": 0.16370487213134766, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8524 + }, + { + "epoch": 6.809105431309904, + "grad_norm": 0.05171172693371773, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8525 + }, + { + "epoch": 6.8099041533546325, + "grad_norm": 0.16393537819385529, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8526 + }, + { + "epoch": 6.810702875399361, + "grad_norm": 0.09398743510246277, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8527 + }, + { + "epoch": 6.81150159744409, + "grad_norm": 0.08430743217468262, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8528 + }, + { + "epoch": 6.812300319488818, + "grad_norm": 0.1131691113114357, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 8529 + }, + { + "epoch": 6.813099041533547, + "grad_norm": 0.0907130092382431, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8530 + }, + { + "epoch": 6.813897763578275, + "grad_norm": 0.1460096687078476, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8531 + }, + { + "epoch": 6.814696485623003, + "grad_norm": 0.07953288406133652, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8532 + }, + { + "epoch": 6.815495207667731, + "grad_norm": 0.061827294528484344, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 8533 + }, + { + "epoch": 6.81629392971246, + "grad_norm": 0.09172365814447403, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8534 + }, + { + "epoch": 6.817092651757188, + "grad_norm": 0.05858466029167175, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8535 + }, + { + "epoch": 6.817891373801917, + "grad_norm": 0.13774308562278748, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8536 + }, + { + "epoch": 6.818690095846645, + "grad_norm": 0.09840130060911179, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8537 + }, + { + "epoch": 6.819488817891374, + "grad_norm": 0.06836584210395813, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8538 + }, + { + "epoch": 6.8202875399361025, + "grad_norm": 0.15930971503257751, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8539 + }, + { + "epoch": 6.821086261980831, + "grad_norm": 0.12306738644838333, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8540 + }, + { + "epoch": 6.821884984025559, + "grad_norm": 0.09868071228265762, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8541 + }, + { + "epoch": 6.822683706070287, + "grad_norm": 0.09411876648664474, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8542 + }, + { + "epoch": 6.823482428115016, + "grad_norm": 0.09062112122774124, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8543 + }, + { + "epoch": 6.824281150159744, + "grad_norm": 0.14964330196380615, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8544 + }, + { + "epoch": 6.825079872204473, + "grad_norm": 0.1444161832332611, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8545 + }, + { + "epoch": 6.825878594249201, + "grad_norm": 0.15247556567192078, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8546 + }, + { + "epoch": 6.82667731629393, + "grad_norm": 0.1556181013584137, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8547 + }, + { + "epoch": 6.827476038338658, + "grad_norm": 0.1781637817621231, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8548 + }, + { + "epoch": 6.828274760383387, + "grad_norm": 0.10066398978233337, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8549 + }, + { + "epoch": 6.8290734824281145, + "grad_norm": 3.0298452377319336, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8550 + }, + { + "epoch": 6.829872204472844, + "grad_norm": 0.2745296061038971, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8551 + }, + { + "epoch": 6.830670926517572, + "grad_norm": 0.4030947983264923, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 8552 + }, + { + "epoch": 6.8314696485623, + "grad_norm": 0.11019638180732727, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8553 + }, + { + "epoch": 6.832268370607029, + "grad_norm": 0.33687886595726013, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8554 + }, + { + "epoch": 6.833067092651757, + "grad_norm": 0.164499431848526, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8555 + }, + { + "epoch": 6.833865814696486, + "grad_norm": 0.31624776124954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8556 + }, + { + "epoch": 6.834664536741214, + "grad_norm": 0.24264110624790192, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8557 + }, + { + "epoch": 6.835463258785943, + "grad_norm": 0.19310493767261505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8558 + }, + { + "epoch": 6.836261980830671, + "grad_norm": 0.2903575003147125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8559 + }, + { + "epoch": 6.8370607028754, + "grad_norm": 0.22584185004234314, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8560 + }, + { + "epoch": 6.837859424920127, + "grad_norm": 0.2400067150592804, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8561 + }, + { + "epoch": 6.838658146964856, + "grad_norm": 0.22543750703334808, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8562 + }, + { + "epoch": 6.8394568690095845, + "grad_norm": 0.2071310430765152, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8563 + }, + { + "epoch": 6.840255591054313, + "grad_norm": 0.07198980450630188, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8564 + }, + { + "epoch": 6.8410543130990416, + "grad_norm": 0.14733794331550598, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8565 + }, + { + "epoch": 6.84185303514377, + "grad_norm": 0.10259919613599777, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8566 + }, + { + "epoch": 6.842651757188499, + "grad_norm": 0.11961761116981506, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8567 + }, + { + "epoch": 6.843450479233227, + "grad_norm": 0.2714863121509552, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8568 + }, + { + "epoch": 6.844249201277956, + "grad_norm": 0.23675218224525452, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8569 + }, + { + "epoch": 6.845047923322683, + "grad_norm": 0.17738480865955353, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8570 + }, + { + "epoch": 6.845846645367412, + "grad_norm": 0.2558303475379944, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8571 + }, + { + "epoch": 6.84664536741214, + "grad_norm": 0.19869430363178253, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8572 + }, + { + "epoch": 6.847444089456869, + "grad_norm": 0.15806829929351807, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8573 + }, + { + "epoch": 6.848242811501597, + "grad_norm": 0.12016306072473526, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8574 + }, + { + "epoch": 6.849041533546326, + "grad_norm": 0.10831576585769653, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8575 + }, + { + "epoch": 6.8498402555910545, + "grad_norm": 0.06762730330228806, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8576 + }, + { + "epoch": 6.850638977635783, + "grad_norm": 0.0824534222483635, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8577 + }, + { + "epoch": 6.8514376996805115, + "grad_norm": 0.20734307169914246, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8578 + }, + { + "epoch": 6.852236421725239, + "grad_norm": 0.22174668312072754, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8579 + }, + { + "epoch": 6.853035143769968, + "grad_norm": 0.05667027458548546, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8580 + }, + { + "epoch": 6.853833865814696, + "grad_norm": 0.2844708561897278, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8581 + }, + { + "epoch": 6.854632587859425, + "grad_norm": 0.21092848479747772, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8582 + }, + { + "epoch": 6.855431309904153, + "grad_norm": 0.08843044936656952, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8583 + }, + { + "epoch": 6.856230031948882, + "grad_norm": 0.08862966299057007, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8584 + }, + { + "epoch": 6.85702875399361, + "grad_norm": 0.13263291120529175, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8585 + }, + { + "epoch": 6.857827476038339, + "grad_norm": 0.1969175636768341, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8586 + }, + { + "epoch": 6.858626198083067, + "grad_norm": 0.1299106925725937, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8587 + }, + { + "epoch": 6.859424920127795, + "grad_norm": 0.058154329657554626, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8588 + }, + { + "epoch": 6.8602236421725244, + "grad_norm": 0.06485166400671005, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8589 + }, + { + "epoch": 6.861022364217252, + "grad_norm": 6.880006313323975, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8590 + }, + { + "epoch": 6.861821086261981, + "grad_norm": 0.09929946064949036, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8591 + }, + { + "epoch": 6.862619808306709, + "grad_norm": 0.11197477579116821, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8592 + }, + { + "epoch": 6.863418530351438, + "grad_norm": 0.06740657985210419, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8593 + }, + { + "epoch": 6.864217252396166, + "grad_norm": 0.19594676792621613, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8594 + }, + { + "epoch": 6.865015974440895, + "grad_norm": 0.16844215989112854, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8595 + }, + { + "epoch": 6.865814696485623, + "grad_norm": 0.08980540931224823, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8596 + }, + { + "epoch": 6.866613418530352, + "grad_norm": 0.1263660043478012, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8597 + }, + { + "epoch": 6.86741214057508, + "grad_norm": 0.2000604271888733, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8598 + }, + { + "epoch": 6.868210862619808, + "grad_norm": 0.08987699449062347, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8599 + }, + { + "epoch": 6.8690095846645365, + "grad_norm": 0.12263453006744385, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8600 + }, + { + "epoch": 6.869808306709265, + "grad_norm": 0.1567721962928772, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8601 + }, + { + "epoch": 6.8706070287539935, + "grad_norm": 0.08756576478481293, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8602 + }, + { + "epoch": 6.871405750798722, + "grad_norm": 0.11816724389791489, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8603 + }, + { + "epoch": 6.872204472843451, + "grad_norm": 0.13798843324184418, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8604 + }, + { + "epoch": 6.873003194888179, + "grad_norm": 0.12364917248487473, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8605 + }, + { + "epoch": 6.873801916932908, + "grad_norm": 0.1200469508767128, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8606 + }, + { + "epoch": 6.874600638977636, + "grad_norm": 0.12144476920366287, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8607 + }, + { + "epoch": 6.875399361022364, + "grad_norm": 0.20083829760551453, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8608 + }, + { + "epoch": 6.876198083067092, + "grad_norm": 0.2817170023918152, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8609 + }, + { + "epoch": 6.876996805111821, + "grad_norm": 0.12137018889188766, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 8610 + }, + { + "epoch": 6.877795527156549, + "grad_norm": 0.09903489053249359, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8611 + }, + { + "epoch": 6.878594249201278, + "grad_norm": 0.17958515882492065, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8612 + }, + { + "epoch": 6.8793929712460065, + "grad_norm": 0.1041099801659584, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8613 + }, + { + "epoch": 6.880191693290735, + "grad_norm": 0.16099892556667328, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8614 + }, + { + "epoch": 6.8809904153354635, + "grad_norm": 0.061900194734334946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8615 + }, + { + "epoch": 6.881789137380192, + "grad_norm": 0.1341199427843094, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8616 + }, + { + "epoch": 6.88258785942492, + "grad_norm": 0.12683184444904327, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8617 + }, + { + "epoch": 6.883386581469648, + "grad_norm": 0.08566799014806747, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8618 + }, + { + "epoch": 6.884185303514377, + "grad_norm": 0.1616903841495514, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8619 + }, + { + "epoch": 6.884984025559105, + "grad_norm": 0.05832672119140625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8620 + }, + { + "epoch": 6.885782747603834, + "grad_norm": 0.15186071395874023, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8621 + }, + { + "epoch": 6.886581469648562, + "grad_norm": 0.16585935652256012, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8622 + }, + { + "epoch": 6.887380191693291, + "grad_norm": 0.1267954260110855, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8623 + }, + { + "epoch": 6.888178913738019, + "grad_norm": 0.22396692633628845, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8624 + }, + { + "epoch": 6.888977635782748, + "grad_norm": 0.133334219455719, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8625 + }, + { + "epoch": 6.8897763578274756, + "grad_norm": 0.1935819834470749, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8626 + }, + { + "epoch": 6.890575079872205, + "grad_norm": 0.32829585671424866, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8627 + }, + { + "epoch": 6.891373801916933, + "grad_norm": 0.231554314494133, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8628 + }, + { + "epoch": 6.892172523961661, + "grad_norm": 0.20693574845790863, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8629 + }, + { + "epoch": 6.89297124600639, + "grad_norm": 0.21037861704826355, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8630 + }, + { + "epoch": 6.893769968051118, + "grad_norm": 0.051133595407009125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8631 + }, + { + "epoch": 6.894568690095847, + "grad_norm": 0.17635062336921692, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8632 + }, + { + "epoch": 6.895367412140575, + "grad_norm": 0.14592808485031128, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8633 + }, + { + "epoch": 6.896166134185304, + "grad_norm": 0.15353697538375854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8634 + }, + { + "epoch": 6.896964856230032, + "grad_norm": 0.19556251168251038, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8635 + }, + { + "epoch": 6.897763578274761, + "grad_norm": 0.06867649406194687, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8636 + }, + { + "epoch": 6.8985623003194885, + "grad_norm": 0.15286169946193695, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8637 + }, + { + "epoch": 6.899361022364217, + "grad_norm": 0.28361746668815613, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8638 + }, + { + "epoch": 6.9001597444089455, + "grad_norm": 0.09351217746734619, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8639 + }, + { + "epoch": 6.900958466453674, + "grad_norm": 0.11050279438495636, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8640 + }, + { + "epoch": 6.901757188498403, + "grad_norm": 0.1648218333721161, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8641 + }, + { + "epoch": 6.902555910543131, + "grad_norm": 0.10323848575353622, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8642 + }, + { + "epoch": 6.90335463258786, + "grad_norm": 0.14925505220890045, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8643 + }, + { + "epoch": 6.904153354632588, + "grad_norm": 0.05877414718270302, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8644 + }, + { + "epoch": 6.904952076677317, + "grad_norm": 0.3324354290962219, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8645 + }, + { + "epoch": 6.905750798722044, + "grad_norm": 0.22756889462471008, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8646 + }, + { + "epoch": 6.906549520766773, + "grad_norm": 0.1040947288274765, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8647 + }, + { + "epoch": 6.907348242811501, + "grad_norm": 0.1310190111398697, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8648 + }, + { + "epoch": 6.90814696485623, + "grad_norm": 0.09484609216451645, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8649 + }, + { + "epoch": 6.9089456869009584, + "grad_norm": 0.13337384164333344, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8650 + }, + { + "epoch": 6.909744408945687, + "grad_norm": 0.31157273054122925, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8651 + }, + { + "epoch": 6.9105431309904155, + "grad_norm": 0.15081669390201569, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8652 + }, + { + "epoch": 6.911341853035144, + "grad_norm": 0.14120221138000488, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8653 + }, + { + "epoch": 6.912140575079873, + "grad_norm": 0.6128084659576416, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8654 + }, + { + "epoch": 6.9129392971246, + "grad_norm": 0.6915252208709717, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 8655 + }, + { + "epoch": 6.913738019169329, + "grad_norm": 0.7245156168937683, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 8656 + }, + { + "epoch": 6.914536741214057, + "grad_norm": 0.8400923013687134, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 8657 + }, + { + "epoch": 6.915335463258786, + "grad_norm": 0.3218044340610504, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 8658 + }, + { + "epoch": 6.916134185303514, + "grad_norm": 0.3119533061981201, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 8659 + }, + { + "epoch": 6.916932907348243, + "grad_norm": 0.2192138433456421, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 8660 + }, + { + "epoch": 6.917731629392971, + "grad_norm": 0.36212611198425293, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 8661 + }, + { + "epoch": 6.9185303514377, + "grad_norm": 0.13674713671207428, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8662 + }, + { + "epoch": 6.919329073482428, + "grad_norm": 0.24960070848464966, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 8663 + }, + { + "epoch": 6.920127795527156, + "grad_norm": 0.16797062754631042, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8664 + }, + { + "epoch": 6.9209265175718855, + "grad_norm": 0.23811157047748566, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 8665 + }, + { + "epoch": 6.921725239616613, + "grad_norm": 0.25372570753097534, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8666 + }, + { + "epoch": 6.922523961661342, + "grad_norm": 0.13954615592956543, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8667 + }, + { + "epoch": 6.92332268370607, + "grad_norm": 0.17769959568977356, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8668 + }, + { + "epoch": 6.924121405750799, + "grad_norm": 0.14327546954154968, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 8669 + }, + { + "epoch": 6.924920127795527, + "grad_norm": 0.07454083859920502, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 8670 + }, + { + "epoch": 6.925718849840256, + "grad_norm": 0.18561266362667084, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8671 + }, + { + "epoch": 6.926517571884984, + "grad_norm": 0.11927005648612976, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 8672 + }, + { + "epoch": 6.927316293929713, + "grad_norm": 0.06790865212678909, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8673 + }, + { + "epoch": 6.928115015974441, + "grad_norm": 0.22627630829811096, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8674 + }, + { + "epoch": 6.928913738019169, + "grad_norm": 0.21341092884540558, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8675 + }, + { + "epoch": 6.9297124600638975, + "grad_norm": 0.19292457401752472, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 8676 + }, + { + "epoch": 6.930511182108626, + "grad_norm": 0.15046356618404388, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8677 + }, + { + "epoch": 6.931309904153355, + "grad_norm": 0.13845203816890717, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8678 + }, + { + "epoch": 6.932108626198083, + "grad_norm": 0.18034739792346954, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8679 + }, + { + "epoch": 6.932907348242812, + "grad_norm": 0.3970269560813904, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8680 + }, + { + "epoch": 6.93370607028754, + "grad_norm": 0.133075550198555, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8681 + }, + { + "epoch": 6.934504792332269, + "grad_norm": 0.13149690628051758, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 8682 + }, + { + "epoch": 6.935303514376997, + "grad_norm": 0.1332010179758072, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8683 + }, + { + "epoch": 6.936102236421725, + "grad_norm": 0.13125883042812347, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 8684 + }, + { + "epoch": 6.936900958466453, + "grad_norm": 0.5500382781028748, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 8685 + }, + { + "epoch": 6.937699680511182, + "grad_norm": 0.09766851365566254, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8686 + }, + { + "epoch": 6.93849840255591, + "grad_norm": 0.10732626169919968, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8687 + }, + { + "epoch": 6.939297124600639, + "grad_norm": 0.10059154033660889, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8688 + }, + { + "epoch": 6.9400958466453675, + "grad_norm": 0.09518695622682571, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8689 + }, + { + "epoch": 6.940894568690096, + "grad_norm": 0.1279720813035965, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8690 + }, + { + "epoch": 6.9416932907348246, + "grad_norm": 0.0997946485877037, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8691 + }, + { + "epoch": 6.942492012779553, + "grad_norm": 0.08584152907133102, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8692 + }, + { + "epoch": 6.943290734824281, + "grad_norm": 0.06987651437520981, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8693 + }, + { + "epoch": 6.944089456869009, + "grad_norm": 0.10446512699127197, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8694 + }, + { + "epoch": 6.944888178913738, + "grad_norm": 0.08535288274288177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8695 + }, + { + "epoch": 6.945686900958466, + "grad_norm": 0.15912187099456787, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8696 + }, + { + "epoch": 6.946485623003195, + "grad_norm": 0.20139484107494354, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8697 + }, + { + "epoch": 6.947284345047923, + "grad_norm": 0.10153409093618393, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8698 + }, + { + "epoch": 6.948083067092652, + "grad_norm": 0.04925902560353279, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8699 + }, + { + "epoch": 6.94888178913738, + "grad_norm": 0.13896742463111877, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8700 + }, + { + "epoch": 6.949680511182109, + "grad_norm": 0.07297761738300323, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8701 + }, + { + "epoch": 6.950479233226837, + "grad_norm": 0.09260845929384232, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8702 + }, + { + "epoch": 6.951277955271565, + "grad_norm": 0.11840535700321198, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8703 + }, + { + "epoch": 6.952076677316294, + "grad_norm": 0.17365501821041107, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8704 + }, + { + "epoch": 6.952875399361022, + "grad_norm": 0.1369183212518692, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8705 + }, + { + "epoch": 6.953674121405751, + "grad_norm": 0.11277196556329727, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8706 + }, + { + "epoch": 6.954472843450479, + "grad_norm": 0.11032512784004211, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8707 + }, + { + "epoch": 6.955271565495208, + "grad_norm": 0.12437347322702408, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8708 + }, + { + "epoch": 6.956070287539936, + "grad_norm": 0.08772306144237518, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8709 + }, + { + "epoch": 6.956869009584665, + "grad_norm": 0.05245213583111763, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8710 + }, + { + "epoch": 6.957667731629393, + "grad_norm": 0.1591174304485321, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8711 + }, + { + "epoch": 6.958466453674122, + "grad_norm": 0.21121510863304138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8712 + }, + { + "epoch": 6.9592651757188495, + "grad_norm": 0.11379709839820862, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8713 + }, + { + "epoch": 6.960063897763578, + "grad_norm": 0.10083793848752975, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8714 + }, + { + "epoch": 6.960862619808307, + "grad_norm": 0.0790674164891243, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8715 + }, + { + "epoch": 6.961661341853035, + "grad_norm": 0.13917089998722076, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8716 + }, + { + "epoch": 6.962460063897764, + "grad_norm": 0.18794408440589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8717 + }, + { + "epoch": 6.963258785942492, + "grad_norm": 0.10725098103284836, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8718 + }, + { + "epoch": 6.964057507987221, + "grad_norm": 0.14577186107635498, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8719 + }, + { + "epoch": 6.964856230031949, + "grad_norm": 0.06711703538894653, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8720 + }, + { + "epoch": 6.965654952076678, + "grad_norm": 0.20572635531425476, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8721 + }, + { + "epoch": 6.966453674121405, + "grad_norm": 0.13693936169147491, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8722 + }, + { + "epoch": 6.967252396166134, + "grad_norm": 0.05642275512218475, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8723 + }, + { + "epoch": 6.968051118210862, + "grad_norm": 0.09080768376588821, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8724 + }, + { + "epoch": 6.968849840255591, + "grad_norm": 0.05295126140117645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8725 + }, + { + "epoch": 6.9696485623003195, + "grad_norm": 0.11833932250738144, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8726 + }, + { + "epoch": 6.970447284345048, + "grad_norm": 0.12110085785388947, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8727 + }, + { + "epoch": 6.9712460063897765, + "grad_norm": 0.10044527053833008, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8728 + }, + { + "epoch": 6.972044728434505, + "grad_norm": 0.13638640940189362, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8729 + }, + { + "epoch": 6.972843450479234, + "grad_norm": 0.18118594586849213, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8730 + }, + { + "epoch": 6.973642172523961, + "grad_norm": 0.1394396871328354, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8731 + }, + { + "epoch": 6.97444089456869, + "grad_norm": 0.14276480674743652, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8732 + }, + { + "epoch": 6.975239616613418, + "grad_norm": 0.2213817834854126, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8733 + }, + { + "epoch": 6.976038338658147, + "grad_norm": 0.11497826874256134, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8734 + }, + { + "epoch": 6.976837060702875, + "grad_norm": 0.11436138302087784, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8735 + }, + { + "epoch": 6.977635782747604, + "grad_norm": 0.08433762192726135, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 8736 + }, + { + "epoch": 6.978434504792332, + "grad_norm": 0.1584242880344391, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8737 + }, + { + "epoch": 6.979233226837061, + "grad_norm": 0.09111067652702332, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8738 + }, + { + "epoch": 6.9800319488817895, + "grad_norm": 0.09075064212083817, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8739 + }, + { + "epoch": 6.980830670926517, + "grad_norm": 0.08456333726644516, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8740 + }, + { + "epoch": 6.981629392971246, + "grad_norm": 0.08090690523386002, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8741 + }, + { + "epoch": 6.982428115015974, + "grad_norm": 0.42019179463386536, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8742 + }, + { + "epoch": 6.983226837060703, + "grad_norm": 0.119536854326725, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8743 + }, + { + "epoch": 6.984025559105431, + "grad_norm": 0.08138761669397354, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8744 + }, + { + "epoch": 6.98482428115016, + "grad_norm": 0.5337278246879578, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8745 + }, + { + "epoch": 6.985623003194888, + "grad_norm": 0.1773308366537094, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8746 + }, + { + "epoch": 6.986421725239617, + "grad_norm": 0.10939478129148483, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8747 + }, + { + "epoch": 6.987220447284345, + "grad_norm": 0.18635793030261993, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8748 + }, + { + "epoch": 6.988019169329074, + "grad_norm": 0.11675454676151276, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8749 + }, + { + "epoch": 6.988817891373802, + "grad_norm": 0.11787068843841553, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8750 + }, + { + "epoch": 6.98961661341853, + "grad_norm": 0.2457057386636734, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8751 + }, + { + "epoch": 6.9904153354632586, + "grad_norm": 0.05914906784892082, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 8752 + }, + { + "epoch": 6.991214057507987, + "grad_norm": 0.1494094878435135, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8753 + }, + { + "epoch": 6.992012779552716, + "grad_norm": 0.14485910534858704, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8754 + }, + { + "epoch": 6.992811501597444, + "grad_norm": 1.2348047494888306, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8755 + }, + { + "epoch": 6.993610223642173, + "grad_norm": 0.1546175330877304, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8756 + }, + { + "epoch": 6.994408945686901, + "grad_norm": 0.13474640250205994, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8757 + }, + { + "epoch": 6.99520766773163, + "grad_norm": 0.5535407662391663, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8758 + }, + { + "epoch": 6.996006389776358, + "grad_norm": 0.10516832023859024, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8759 + }, + { + "epoch": 6.996805111821086, + "grad_norm": 0.07872752100229263, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8760 + }, + { + "epoch": 6.997603833865814, + "grad_norm": 0.08130715042352676, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8761 + }, + { + "epoch": 6.998402555910543, + "grad_norm": 0.09496142715215683, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8762 + }, + { + "epoch": 6.9992012779552715, + "grad_norm": 0.06645053625106812, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8763 + }, + { + "epoch": 7.0, + "grad_norm": 0.07332758605480194, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8764 + }, + { + "epoch": 7.0007987220447285, + "grad_norm": 0.09108536690473557, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8765 + }, + { + "epoch": 7.001597444089457, + "grad_norm": 0.13202883303165436, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8766 + }, + { + "epoch": 7.002396166134186, + "grad_norm": 0.09079252928495407, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 8767 + }, + { + "epoch": 7.003194888178914, + "grad_norm": 0.1004822626709938, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8768 + }, + { + "epoch": 7.003993610223642, + "grad_norm": 0.05096781253814697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8769 + }, + { + "epoch": 7.00479233226837, + "grad_norm": 0.14213396608829498, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8770 + }, + { + "epoch": 7.005591054313099, + "grad_norm": 0.11614344269037247, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8771 + }, + { + "epoch": 7.006389776357827, + "grad_norm": 0.1144147664308548, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8772 + }, + { + "epoch": 7.007188498402556, + "grad_norm": 0.1504330188035965, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8773 + }, + { + "epoch": 7.007987220447284, + "grad_norm": 0.10443079471588135, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8774 + }, + { + "epoch": 7.008785942492013, + "grad_norm": 0.166890949010849, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8775 + }, + { + "epoch": 7.0095846645367414, + "grad_norm": 0.12496565282344818, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8776 + }, + { + "epoch": 7.01038338658147, + "grad_norm": 0.12851381301879883, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8777 + }, + { + "epoch": 7.0111821086261985, + "grad_norm": 0.20198717713356018, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8778 + }, + { + "epoch": 7.011980830670926, + "grad_norm": 0.10324864089488983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8779 + }, + { + "epoch": 7.012779552715655, + "grad_norm": 0.12864094972610474, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8780 + }, + { + "epoch": 7.013578274760383, + "grad_norm": 0.11301549524068832, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8781 + }, + { + "epoch": 7.014376996805112, + "grad_norm": 0.13162367045879364, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8782 + }, + { + "epoch": 7.01517571884984, + "grad_norm": 0.1574760377407074, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8783 + }, + { + "epoch": 7.015974440894569, + "grad_norm": 0.07471634447574615, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8784 + }, + { + "epoch": 7.016773162939297, + "grad_norm": 0.09653516113758087, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8785 + }, + { + "epoch": 7.017571884984026, + "grad_norm": 0.13719993829727173, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8786 + }, + { + "epoch": 7.018370607028754, + "grad_norm": 0.10545443743467331, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8787 + }, + { + "epoch": 7.019169329073482, + "grad_norm": 0.1147511675953865, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8788 + }, + { + "epoch": 7.0199680511182105, + "grad_norm": 0.14005234837532043, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8789 + }, + { + "epoch": 7.020766773162939, + "grad_norm": 0.36956554651260376, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8790 + }, + { + "epoch": 7.021565495207668, + "grad_norm": 0.1384177953004837, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8791 + }, + { + "epoch": 7.022364217252396, + "grad_norm": 0.062106356024742126, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8792 + }, + { + "epoch": 7.023162939297125, + "grad_norm": 0.14074385166168213, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8793 + }, + { + "epoch": 7.023961661341853, + "grad_norm": 0.18152809143066406, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8794 + }, + { + "epoch": 7.024760383386582, + "grad_norm": 0.11607832461595535, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8795 + }, + { + "epoch": 7.02555910543131, + "grad_norm": 0.06603241711854935, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8796 + }, + { + "epoch": 7.026357827476039, + "grad_norm": 0.08846289664506912, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8797 + }, + { + "epoch": 7.027156549520766, + "grad_norm": 0.09882134944200516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8798 + }, + { + "epoch": 7.027955271565495, + "grad_norm": 0.11535032093524933, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8799 + }, + { + "epoch": 7.0287539936102235, + "grad_norm": 0.10153281688690186, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8800 + }, + { + "epoch": 7.029552715654952, + "grad_norm": 0.11195418983697891, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8801 + }, + { + "epoch": 7.0303514376996805, + "grad_norm": 0.5721603035926819, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8802 + }, + { + "epoch": 7.031150159744409, + "grad_norm": 0.18006286025047302, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8803 + }, + { + "epoch": 7.031948881789138, + "grad_norm": 0.16561086475849152, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8804 + }, + { + "epoch": 7.032747603833866, + "grad_norm": 0.11010444164276123, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8805 + }, + { + "epoch": 7.033546325878595, + "grad_norm": 0.17741475999355316, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8806 + }, + { + "epoch": 7.034345047923322, + "grad_norm": 0.09941161423921585, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 8807 + }, + { + "epoch": 7.035143769968051, + "grad_norm": 0.20474617183208466, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8808 + }, + { + "epoch": 7.035942492012779, + "grad_norm": 0.07972154021263123, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8809 + }, + { + "epoch": 7.036741214057508, + "grad_norm": 0.17856109142303467, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8810 + }, + { + "epoch": 7.037539936102236, + "grad_norm": 0.1276514083147049, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8811 + }, + { + "epoch": 7.038338658146965, + "grad_norm": 0.08009849488735199, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 8812 + }, + { + "epoch": 7.039137380191693, + "grad_norm": 0.09832913428544998, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8813 + }, + { + "epoch": 7.039936102236422, + "grad_norm": 0.06454402953386307, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8814 + }, + { + "epoch": 7.0407348242811505, + "grad_norm": 0.20843401551246643, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8815 + }, + { + "epoch": 7.041533546325879, + "grad_norm": 0.14909301698207855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8816 + }, + { + "epoch": 7.042332268370607, + "grad_norm": 0.08815812319517136, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8817 + }, + { + "epoch": 7.043130990415335, + "grad_norm": 0.18957766890525818, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8818 + }, + { + "epoch": 7.043929712460064, + "grad_norm": 0.33018213510513306, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8819 + }, + { + "epoch": 7.044728434504792, + "grad_norm": 0.11069374531507492, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8820 + }, + { + "epoch": 7.045527156549521, + "grad_norm": 0.3001084625720978, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8821 + }, + { + "epoch": 7.046325878594249, + "grad_norm": 0.0704922303557396, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8822 + }, + { + "epoch": 7.047124600638978, + "grad_norm": 0.08537211269140244, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8823 + }, + { + "epoch": 7.047923322683706, + "grad_norm": 0.08765899389982224, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8824 + }, + { + "epoch": 7.048722044728435, + "grad_norm": 0.14218255877494812, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8825 + }, + { + "epoch": 7.0495207667731625, + "grad_norm": 0.08026671409606934, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8826 + }, + { + "epoch": 7.050319488817891, + "grad_norm": 0.07170549035072327, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8827 + }, + { + "epoch": 7.05111821086262, + "grad_norm": 1.2578401565551758, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8828 + }, + { + "epoch": 7.051916932907348, + "grad_norm": 0.20149891078472137, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8829 + }, + { + "epoch": 7.052715654952077, + "grad_norm": 0.18734677135944366, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8830 + }, + { + "epoch": 7.053514376996805, + "grad_norm": 0.08732877671718597, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8831 + }, + { + "epoch": 7.054313099041534, + "grad_norm": 0.1895754486322403, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8832 + }, + { + "epoch": 7.055111821086262, + "grad_norm": 0.06839644908905029, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8833 + }, + { + "epoch": 7.055910543130991, + "grad_norm": 4.666222095489502, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8834 + }, + { + "epoch": 7.056709265175719, + "grad_norm": 0.2801821231842041, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8835 + }, + { + "epoch": 7.057507987220447, + "grad_norm": 0.3428499102592468, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8836 + }, + { + "epoch": 7.0583067092651754, + "grad_norm": 0.16896478831768036, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8837 + }, + { + "epoch": 7.059105431309904, + "grad_norm": 1.21062171459198, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8838 + }, + { + "epoch": 7.0599041533546325, + "grad_norm": 0.20507270097732544, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8839 + }, + { + "epoch": 7.060702875399361, + "grad_norm": 0.34736308455467224, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8840 + }, + { + "epoch": 7.06150159744409, + "grad_norm": 0.13628798723220825, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8841 + }, + { + "epoch": 7.062300319488818, + "grad_norm": 0.3212411403656006, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8842 + }, + { + "epoch": 7.063099041533547, + "grad_norm": 0.23049144446849823, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8843 + }, + { + "epoch": 7.063897763578275, + "grad_norm": 0.2785285413265228, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8844 + }, + { + "epoch": 7.064696485623003, + "grad_norm": 0.32158368825912476, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8845 + }, + { + "epoch": 7.065495207667731, + "grad_norm": 0.40443500876426697, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8846 + }, + { + "epoch": 7.06629392971246, + "grad_norm": 0.20072752237319946, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8847 + }, + { + "epoch": 7.067092651757188, + "grad_norm": 0.38166266679763794, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8848 + }, + { + "epoch": 7.067891373801917, + "grad_norm": 0.2771472930908203, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8849 + }, + { + "epoch": 7.068690095846645, + "grad_norm": 0.10485964268445969, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8850 + }, + { + "epoch": 7.069488817891374, + "grad_norm": 0.17424215376377106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8851 + }, + { + "epoch": 7.0702875399361025, + "grad_norm": 0.0972314327955246, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8852 + }, + { + "epoch": 7.071086261980831, + "grad_norm": 0.18021832406520844, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8853 + }, + { + "epoch": 7.0718849840255595, + "grad_norm": 0.08820143342018127, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8854 + }, + { + "epoch": 7.072683706070287, + "grad_norm": 0.1785898506641388, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8855 + }, + { + "epoch": 7.073482428115016, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8856 + }, + { + "epoch": 7.074281150159744, + "grad_norm": 0.1787438541650772, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8857 + }, + { + "epoch": 7.075079872204473, + "grad_norm": 0.16761353611946106, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8858 + }, + { + "epoch": 7.075878594249201, + "grad_norm": 0.5075165629386902, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8859 + }, + { + "epoch": 7.07667731629393, + "grad_norm": 0.13462364673614502, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8860 + }, + { + "epoch": 7.077476038338658, + "grad_norm": 0.20478707551956177, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8861 + }, + { + "epoch": 7.078274760383387, + "grad_norm": 0.14689947664737701, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8862 + }, + { + "epoch": 7.079073482428115, + "grad_norm": 0.36265847086906433, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8863 + }, + { + "epoch": 7.079872204472843, + "grad_norm": 0.18443043529987335, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8864 + }, + { + "epoch": 7.080670926517572, + "grad_norm": 0.04789111018180847, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8865 + }, + { + "epoch": 7.0814696485623, + "grad_norm": 0.18024222552776337, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8866 + }, + { + "epoch": 7.082268370607029, + "grad_norm": 0.08901690691709518, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8867 + }, + { + "epoch": 7.083067092651757, + "grad_norm": 0.20689153671264648, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8868 + }, + { + "epoch": 7.083865814696486, + "grad_norm": 0.15572768449783325, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8869 + }, + { + "epoch": 7.084664536741214, + "grad_norm": 0.2915050685405731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8870 + }, + { + "epoch": 7.085463258785943, + "grad_norm": 0.12404290586709976, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8871 + }, + { + "epoch": 7.086261980830671, + "grad_norm": 0.19628335535526276, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8872 + }, + { + "epoch": 7.0870607028754, + "grad_norm": 0.6693617105484009, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8873 + }, + { + "epoch": 7.087859424920127, + "grad_norm": 0.21526481211185455, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8874 + }, + { + "epoch": 7.088658146964856, + "grad_norm": 0.2779954969882965, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8875 + }, + { + "epoch": 7.0894568690095845, + "grad_norm": 0.14111320674419403, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8876 + }, + { + "epoch": 7.090255591054313, + "grad_norm": 0.26465079188346863, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8877 + }, + { + "epoch": 7.0910543130990416, + "grad_norm": 0.12354349344968796, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8878 + }, + { + "epoch": 7.09185303514377, + "grad_norm": 0.18360896408557892, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8879 + }, + { + "epoch": 7.092651757188499, + "grad_norm": 0.26844218373298645, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8880 + }, + { + "epoch": 7.093450479233227, + "grad_norm": 0.34032055735588074, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8881 + }, + { + "epoch": 7.094249201277956, + "grad_norm": 0.2372630089521408, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8882 + }, + { + "epoch": 7.095047923322683, + "grad_norm": 0.4134571850299835, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8883 + }, + { + "epoch": 7.095846645367412, + "grad_norm": 0.21220949292182922, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8884 + }, + { + "epoch": 7.09664536741214, + "grad_norm": 0.20073527097702026, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8885 + }, + { + "epoch": 7.097444089456869, + "grad_norm": 0.1583309918642044, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8886 + }, + { + "epoch": 7.098242811501597, + "grad_norm": 0.4032151401042938, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8887 + }, + { + "epoch": 7.099041533546326, + "grad_norm": 0.09527560323476791, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 8888 + }, + { + "epoch": 7.0998402555910545, + "grad_norm": 0.2630043625831604, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8889 + }, + { + "epoch": 7.100638977635783, + "grad_norm": 0.06699138134717941, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8890 + }, + { + "epoch": 7.1014376996805115, + "grad_norm": 0.34307003021240234, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8891 + }, + { + "epoch": 7.102236421725239, + "grad_norm": 0.24538451433181763, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8892 + }, + { + "epoch": 7.103035143769968, + "grad_norm": 0.2794513702392578, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8893 + }, + { + "epoch": 7.103833865814696, + "grad_norm": 0.20586012303829193, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8894 + }, + { + "epoch": 7.104632587859425, + "grad_norm": 0.22349807620048523, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8895 + }, + { + "epoch": 7.105431309904153, + "grad_norm": 0.31171584129333496, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8896 + }, + { + "epoch": 7.106230031948882, + "grad_norm": 0.07461030781269073, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8897 + }, + { + "epoch": 7.10702875399361, + "grad_norm": 0.24280597269535065, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8898 + }, + { + "epoch": 7.107827476038339, + "grad_norm": 0.13005708158016205, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8899 + }, + { + "epoch": 7.108626198083067, + "grad_norm": 0.24730080366134644, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8900 + }, + { + "epoch": 7.109424920127796, + "grad_norm": 1.287341833114624, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8901 + }, + { + "epoch": 7.110223642172524, + "grad_norm": 0.15945735573768616, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8902 + }, + { + "epoch": 7.111022364217252, + "grad_norm": 0.09943541884422302, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8903 + }, + { + "epoch": 7.111821086261981, + "grad_norm": 0.12183468043804169, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8904 + }, + { + "epoch": 7.112619808306709, + "grad_norm": 0.11859191954135895, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8905 + }, + { + "epoch": 7.113418530351438, + "grad_norm": 0.27701425552368164, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8906 + }, + { + "epoch": 7.114217252396166, + "grad_norm": 0.14724725484848022, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8907 + }, + { + "epoch": 7.115015974440895, + "grad_norm": 0.1342400461435318, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8908 + }, + { + "epoch": 7.115814696485623, + "grad_norm": 0.15474970638751984, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8909 + }, + { + "epoch": 7.116613418530352, + "grad_norm": 0.1276721954345703, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8910 + }, + { + "epoch": 7.11741214057508, + "grad_norm": 0.14511124789714813, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8911 + }, + { + "epoch": 7.118210862619808, + "grad_norm": 0.10112027823925018, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8912 + }, + { + "epoch": 7.1190095846645365, + "grad_norm": 0.17296795547008514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8913 + }, + { + "epoch": 7.119808306709265, + "grad_norm": 0.09542828798294067, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8914 + }, + { + "epoch": 7.1206070287539935, + "grad_norm": 0.17453183233737946, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8915 + }, + { + "epoch": 7.121405750798722, + "grad_norm": 0.13417603075504303, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8916 + }, + { + "epoch": 7.122204472843451, + "grad_norm": 0.26239508390426636, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8917 + }, + { + "epoch": 7.123003194888179, + "grad_norm": 0.13963834941387177, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8918 + }, + { + "epoch": 7.123801916932908, + "grad_norm": 0.18642054498195648, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8919 + }, + { + "epoch": 7.124600638977636, + "grad_norm": 0.17754590511322021, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8920 + }, + { + "epoch": 7.125399361022364, + "grad_norm": 0.1010628268122673, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8921 + }, + { + "epoch": 7.126198083067092, + "grad_norm": 0.1621905416250229, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8922 + }, + { + "epoch": 7.126996805111821, + "grad_norm": 0.3069966733455658, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8923 + }, + { + "epoch": 7.127795527156549, + "grad_norm": 0.2312333881855011, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8924 + }, + { + "epoch": 7.128594249201278, + "grad_norm": 0.20297785103321075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8925 + }, + { + "epoch": 7.1293929712460065, + "grad_norm": 0.18856601417064667, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8926 + }, + { + "epoch": 7.130191693290735, + "grad_norm": 0.19353985786437988, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8927 + }, + { + "epoch": 7.1309904153354635, + "grad_norm": 0.08276687562465668, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8928 + }, + { + "epoch": 7.131789137380192, + "grad_norm": 0.31372779607772827, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8929 + }, + { + "epoch": 7.13258785942492, + "grad_norm": 0.10208959877490997, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8930 + }, + { + "epoch": 7.133386581469648, + "grad_norm": 0.1636659801006317, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8931 + }, + { + "epoch": 7.134185303514377, + "grad_norm": 0.14321425557136536, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8932 + }, + { + "epoch": 7.134984025559105, + "grad_norm": 0.08438511192798615, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8933 + }, + { + "epoch": 7.135782747603834, + "grad_norm": 0.17451012134552002, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8934 + }, + { + "epoch": 7.136581469648562, + "grad_norm": 0.06913795322179794, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8935 + }, + { + "epoch": 7.137380191693291, + "grad_norm": 0.14176666736602783, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8936 + }, + { + "epoch": 7.138178913738019, + "grad_norm": 0.15005643665790558, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8937 + }, + { + "epoch": 7.138977635782748, + "grad_norm": 0.08884457498788834, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8938 + }, + { + "epoch": 7.139776357827476, + "grad_norm": 0.19651612639427185, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8939 + }, + { + "epoch": 7.140575079872204, + "grad_norm": 0.12419132143259048, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8940 + }, + { + "epoch": 7.141373801916933, + "grad_norm": 0.08800125867128372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8941 + }, + { + "epoch": 7.142172523961661, + "grad_norm": 0.12308578193187714, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8942 + }, + { + "epoch": 7.14297124600639, + "grad_norm": 0.06376682221889496, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8943 + }, + { + "epoch": 7.143769968051118, + "grad_norm": 0.08467467129230499, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8944 + }, + { + "epoch": 7.144568690095847, + "grad_norm": 0.05492696538567543, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8945 + }, + { + "epoch": 7.145367412140575, + "grad_norm": 0.12659363448619843, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8946 + }, + { + "epoch": 7.146166134185304, + "grad_norm": 0.11025204509496689, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8947 + }, + { + "epoch": 7.146964856230032, + "grad_norm": 0.03672007843852043, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8948 + }, + { + "epoch": 7.147763578274761, + "grad_norm": 0.06386546790599823, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8949 + }, + { + "epoch": 7.1485623003194885, + "grad_norm": 0.05484751984477043, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8950 + }, + { + "epoch": 7.149361022364217, + "grad_norm": 0.08663280308246613, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8951 + }, + { + "epoch": 7.1501597444089455, + "grad_norm": 0.10515031963586807, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8952 + }, + { + "epoch": 7.150958466453674, + "grad_norm": 0.05844622105360031, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8953 + }, + { + "epoch": 7.151757188498403, + "grad_norm": 0.061575960367918015, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8954 + }, + { + "epoch": 7.152555910543131, + "grad_norm": 0.30169913172721863, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8955 + }, + { + "epoch": 7.15335463258786, + "grad_norm": 0.15433792769908905, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8956 + }, + { + "epoch": 7.154153354632588, + "grad_norm": 0.11872339993715286, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8957 + }, + { + "epoch": 7.154952076677317, + "grad_norm": 0.4086587131023407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8958 + }, + { + "epoch": 7.155750798722044, + "grad_norm": 0.0976172536611557, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8959 + }, + { + "epoch": 7.156549520766773, + "grad_norm": 0.11132699996232986, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8960 + }, + { + "epoch": 7.157348242811501, + "grad_norm": 0.11129645258188248, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8961 + }, + { + "epoch": 7.15814696485623, + "grad_norm": 0.09004200249910355, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8962 + }, + { + "epoch": 7.1589456869009584, + "grad_norm": 0.1225908026099205, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8963 + }, + { + "epoch": 7.159744408945687, + "grad_norm": 0.10531286895275116, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8964 + }, + { + "epoch": 7.1605431309904155, + "grad_norm": 0.1054515391588211, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8965 + }, + { + "epoch": 7.161341853035144, + "grad_norm": 0.11718834936618805, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8966 + }, + { + "epoch": 7.162140575079873, + "grad_norm": 0.11314168572425842, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8967 + }, + { + "epoch": 7.1629392971246, + "grad_norm": 0.1017487570643425, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8968 + }, + { + "epoch": 7.163738019169329, + "grad_norm": 0.05381032079458237, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8969 + }, + { + "epoch": 7.164536741214057, + "grad_norm": 0.1527879238128662, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8970 + }, + { + "epoch": 7.165335463258786, + "grad_norm": 0.05352415144443512, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8971 + }, + { + "epoch": 7.166134185303514, + "grad_norm": 0.17179784178733826, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8972 + }, + { + "epoch": 7.166932907348243, + "grad_norm": 0.24629469215869904, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8973 + }, + { + "epoch": 7.167731629392971, + "grad_norm": 0.11276146024465561, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8974 + }, + { + "epoch": 7.1685303514377, + "grad_norm": 0.0927032083272934, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8975 + }, + { + "epoch": 7.169329073482428, + "grad_norm": 0.0978626236319542, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8976 + }, + { + "epoch": 7.170127795527157, + "grad_norm": 0.12577946484088898, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8977 + }, + { + "epoch": 7.170926517571885, + "grad_norm": 0.1014678105711937, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8978 + }, + { + "epoch": 7.171725239616613, + "grad_norm": 0.08706190437078476, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8979 + }, + { + "epoch": 7.172523961661342, + "grad_norm": 0.06214338168501854, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8980 + }, + { + "epoch": 7.17332268370607, + "grad_norm": 0.08223161101341248, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8981 + }, + { + "epoch": 7.174121405750799, + "grad_norm": 0.3143157362937927, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8982 + }, + { + "epoch": 7.174920127795527, + "grad_norm": 0.16466212272644043, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8983 + }, + { + "epoch": 7.175718849840256, + "grad_norm": 0.13650043308734894, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8984 + }, + { + "epoch": 7.176517571884984, + "grad_norm": 0.05605694651603699, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8985 + }, + { + "epoch": 7.177316293929713, + "grad_norm": 0.12153269350528717, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8986 + }, + { + "epoch": 7.178115015974441, + "grad_norm": 0.07390844076871872, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8987 + }, + { + "epoch": 7.178913738019169, + "grad_norm": 0.05618416517972946, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8988 + }, + { + "epoch": 7.1797124600638975, + "grad_norm": 0.24178527295589447, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8989 + }, + { + "epoch": 7.180511182108626, + "grad_norm": 0.06414328515529633, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8990 + }, + { + "epoch": 7.181309904153355, + "grad_norm": 0.05483662337064743, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8991 + }, + { + "epoch": 7.182108626198083, + "grad_norm": 0.05821032077074051, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8992 + }, + { + "epoch": 7.182907348242812, + "grad_norm": 0.04972073435783386, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8993 + }, + { + "epoch": 7.18370607028754, + "grad_norm": 0.13323748111724854, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8994 + }, + { + "epoch": 7.184504792332269, + "grad_norm": 0.1341763287782669, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8995 + }, + { + "epoch": 7.185303514376997, + "grad_norm": 0.1092606782913208, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8996 + }, + { + "epoch": 7.186102236421725, + "grad_norm": 0.10611139982938766, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8997 + }, + { + "epoch": 7.186900958466453, + "grad_norm": 0.0810476616024971, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8998 + }, + { + "epoch": 7.187699680511182, + "grad_norm": 0.053938958793878555, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8999 + }, + { + "epoch": 7.18849840255591, + "grad_norm": 0.08355431258678436, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9000 + }, + { + "epoch": 7.189297124600639, + "grad_norm": 0.0719372034072876, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9001 + }, + { + "epoch": 7.1900958466453675, + "grad_norm": 0.0541183203458786, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9002 + }, + { + "epoch": 7.190894568690096, + "grad_norm": 0.08637872338294983, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9003 + }, + { + "epoch": 7.1916932907348246, + "grad_norm": 0.0900801345705986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9004 + }, + { + "epoch": 7.192492012779553, + "grad_norm": 0.08778835088014603, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9005 + }, + { + "epoch": 7.193290734824281, + "grad_norm": 0.13946911692619324, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9006 + }, + { + "epoch": 7.194089456869009, + "grad_norm": 0.20089952647686005, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9007 + }, + { + "epoch": 7.194888178913738, + "grad_norm": 0.20472672581672668, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 9008 + }, + { + "epoch": 7.195686900958466, + "grad_norm": 0.09503829479217529, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9009 + }, + { + "epoch": 7.196485623003195, + "grad_norm": 0.057289477437734604, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9010 + }, + { + "epoch": 7.197284345047923, + "grad_norm": 0.18998531997203827, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9011 + }, + { + "epoch": 7.198083067092652, + "grad_norm": 0.12228010594844818, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9012 + }, + { + "epoch": 7.19888178913738, + "grad_norm": 0.0855637639760971, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9013 + }, + { + "epoch": 7.199680511182109, + "grad_norm": 0.08341407775878906, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9014 + }, + { + "epoch": 7.2004792332268375, + "grad_norm": 0.06806697696447372, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9015 + }, + { + "epoch": 7.201277955271565, + "grad_norm": 0.06730692833662033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9016 + }, + { + "epoch": 7.202076677316294, + "grad_norm": 0.04983438923954964, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9017 + }, + { + "epoch": 7.202875399361022, + "grad_norm": 0.09153386205434799, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9018 + }, + { + "epoch": 7.203674121405751, + "grad_norm": 0.06117153540253639, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9019 + }, + { + "epoch": 7.204472843450479, + "grad_norm": 0.056790344417095184, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9020 + }, + { + "epoch": 7.205271565495208, + "grad_norm": 0.8241305351257324, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9021 + }, + { + "epoch": 7.206070287539936, + "grad_norm": 0.21823863685131073, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9022 + }, + { + "epoch": 7.206869009584665, + "grad_norm": 0.14799124002456665, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9023 + }, + { + "epoch": 7.207667731629393, + "grad_norm": 0.09815513342618942, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9024 + }, + { + "epoch": 7.208466453674121, + "grad_norm": 0.2076011300086975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9025 + }, + { + "epoch": 7.2092651757188495, + "grad_norm": 0.13652865588665009, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9026 + }, + { + "epoch": 7.210063897763578, + "grad_norm": 0.15180739760398865, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9027 + }, + { + "epoch": 7.210862619808307, + "grad_norm": 0.11385779827833176, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9028 + }, + { + "epoch": 7.211661341853035, + "grad_norm": 0.05047432705760002, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9029 + }, + { + "epoch": 7.212460063897764, + "grad_norm": 0.13789398968219757, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9030 + }, + { + "epoch": 7.213258785942492, + "grad_norm": 0.10509981215000153, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9031 + }, + { + "epoch": 7.214057507987221, + "grad_norm": 0.19650724530220032, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9032 + }, + { + "epoch": 7.214856230031949, + "grad_norm": 0.11788946390151978, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9033 + }, + { + "epoch": 7.215654952076678, + "grad_norm": 0.11023712903261185, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9034 + }, + { + "epoch": 7.216453674121405, + "grad_norm": 0.3382134735584259, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9035 + }, + { + "epoch": 7.217252396166134, + "grad_norm": 0.20465348660945892, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9036 + }, + { + "epoch": 7.218051118210862, + "grad_norm": 0.17456264793872833, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9037 + }, + { + "epoch": 7.218849840255591, + "grad_norm": 0.09034306555986404, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9038 + }, + { + "epoch": 7.2196485623003195, + "grad_norm": 0.15296493470668793, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9039 + }, + { + "epoch": 7.220447284345048, + "grad_norm": 0.1379650980234146, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9040 + }, + { + "epoch": 7.2212460063897765, + "grad_norm": 0.20932430028915405, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9041 + }, + { + "epoch": 7.222044728434505, + "grad_norm": 0.09309016168117523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9042 + }, + { + "epoch": 7.222843450479234, + "grad_norm": 0.13084891438484192, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9043 + }, + { + "epoch": 7.223642172523961, + "grad_norm": 0.1435803472995758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9044 + }, + { + "epoch": 7.22444089456869, + "grad_norm": 0.05868425592780113, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9045 + }, + { + "epoch": 7.225239616613418, + "grad_norm": 0.09483210742473602, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9046 + }, + { + "epoch": 7.226038338658147, + "grad_norm": 0.20051591098308563, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9047 + }, + { + "epoch": 7.226837060702875, + "grad_norm": 0.09253975749015808, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9048 + }, + { + "epoch": 7.227635782747604, + "grad_norm": 0.15865609049797058, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9049 + }, + { + "epoch": 7.228434504792332, + "grad_norm": 0.14421933889389038, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9050 + }, + { + "epoch": 7.229233226837061, + "grad_norm": 0.13492006063461304, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9051 + }, + { + "epoch": 7.2300319488817895, + "grad_norm": 0.06581155210733414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9052 + }, + { + "epoch": 7.230830670926518, + "grad_norm": 0.12610170245170593, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9053 + }, + { + "epoch": 7.231629392971246, + "grad_norm": 0.12813681364059448, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9054 + }, + { + "epoch": 7.232428115015974, + "grad_norm": 0.07228157669305801, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9055 + }, + { + "epoch": 7.233226837060703, + "grad_norm": 0.13456740975379944, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9056 + }, + { + "epoch": 7.234025559105431, + "grad_norm": 0.10491029918193817, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9057 + }, + { + "epoch": 7.23482428115016, + "grad_norm": 0.14090387523174286, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9058 + }, + { + "epoch": 7.235623003194888, + "grad_norm": 0.10722684115171432, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9059 + }, + { + "epoch": 7.236421725239617, + "grad_norm": 0.05123287811875343, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9060 + }, + { + "epoch": 7.237220447284345, + "grad_norm": 0.1203593909740448, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9061 + }, + { + "epoch": 7.238019169329074, + "grad_norm": 0.07847320288419724, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9062 + }, + { + "epoch": 7.2388178913738015, + "grad_norm": 0.09621457010507584, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9063 + }, + { + "epoch": 7.23961661341853, + "grad_norm": 0.11915068328380585, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9064 + }, + { + "epoch": 7.2404153354632586, + "grad_norm": 0.18357326090335846, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9065 + }, + { + "epoch": 7.241214057507987, + "grad_norm": 0.06862817704677582, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9066 + }, + { + "epoch": 7.242012779552716, + "grad_norm": 0.05091634392738342, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9067 + }, + { + "epoch": 7.242811501597444, + "grad_norm": 0.09132825583219528, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9068 + }, + { + "epoch": 7.243610223642173, + "grad_norm": 0.11998780816793442, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9069 + }, + { + "epoch": 7.244408945686901, + "grad_norm": 0.0678768903017044, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9070 + }, + { + "epoch": 7.24520766773163, + "grad_norm": 0.19880260527133942, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9071 + }, + { + "epoch": 7.246006389776358, + "grad_norm": 0.06379543989896774, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9072 + }, + { + "epoch": 7.246805111821086, + "grad_norm": 0.06652764976024628, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9073 + }, + { + "epoch": 7.247603833865814, + "grad_norm": 0.10495885461568832, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9074 + }, + { + "epoch": 7.248402555910543, + "grad_norm": 0.14753985404968262, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9075 + }, + { + "epoch": 7.2492012779552715, + "grad_norm": 0.08283182233572006, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9076 + }, + { + "epoch": 7.25, + "grad_norm": 0.1378672569990158, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9077 + }, + { + "epoch": 7.2507987220447285, + "grad_norm": 0.10274125635623932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9078 + }, + { + "epoch": 7.251597444089457, + "grad_norm": 0.09236814826726913, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9079 + }, + { + "epoch": 7.252396166134186, + "grad_norm": 0.07923156023025513, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9080 + }, + { + "epoch": 7.253194888178914, + "grad_norm": 0.2953792214393616, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9081 + }, + { + "epoch": 7.253993610223642, + "grad_norm": 9.043856620788574, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9082 + }, + { + "epoch": 7.25479233226837, + "grad_norm": 60.094329833984375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9083 + }, + { + "epoch": 7.255591054313099, + "grad_norm": 48.363075256347656, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 9084 + }, + { + "epoch": 7.256389776357827, + "grad_norm": 92.13807678222656, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9085 + }, + { + "epoch": 7.257188498402556, + "grad_norm": 71.66429138183594, + "learning_rate": 0.0005, + "loss": 1.1524, + "step": 9086 + }, + { + "epoch": 7.257987220447284, + "grad_norm": 29.742534637451172, + "learning_rate": 0.0005, + "loss": 1.2362, + "step": 9087 + }, + { + "epoch": 7.258785942492013, + "grad_norm": 1.1841496229171753, + "learning_rate": 0.0005, + "loss": 1.4452, + "step": 9088 + }, + { + "epoch": 7.2595846645367414, + "grad_norm": 0.7909824252128601, + "learning_rate": 0.0005, + "loss": 1.3049, + "step": 9089 + }, + { + "epoch": 7.26038338658147, + "grad_norm": 0.796114444732666, + "learning_rate": 0.0005, + "loss": 1.2852, + "step": 9090 + }, + { + "epoch": 7.261182108626198, + "grad_norm": 0.9014440178871155, + "learning_rate": 0.0005, + "loss": 1.2243, + "step": 9091 + }, + { + "epoch": 7.261980830670926, + "grad_norm": 0.5654944777488708, + "learning_rate": 0.0005, + "loss": 1.1462, + "step": 9092 + }, + { + "epoch": 7.262779552715655, + "grad_norm": 1.0784763097763062, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 9093 + }, + { + "epoch": 7.263578274760383, + "grad_norm": 0.9014595150947571, + "learning_rate": 0.0005, + "loss": 1.1629, + "step": 9094 + }, + { + "epoch": 7.264376996805112, + "grad_norm": 0.4847378730773926, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9095 + }, + { + "epoch": 7.26517571884984, + "grad_norm": 0.5493710041046143, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9096 + }, + { + "epoch": 7.265974440894569, + "grad_norm": 1.0691193342208862, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 9097 + }, + { + "epoch": 7.266773162939297, + "grad_norm": 2.062331199645996, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 9098 + }, + { + "epoch": 7.267571884984026, + "grad_norm": 2.778977632522583, + "learning_rate": 0.0005, + "loss": 1.2775, + "step": 9099 + }, + { + "epoch": 7.268370607028754, + "grad_norm": 0.8807574510574341, + "learning_rate": 0.0005, + "loss": 1.2851, + "step": 9100 + }, + { + "epoch": 7.269169329073483, + "grad_norm": 1.0370792150497437, + "learning_rate": 0.0005, + "loss": 1.1677, + "step": 9101 + }, + { + "epoch": 7.2699680511182105, + "grad_norm": 0.5272591710090637, + "learning_rate": 0.0005, + "loss": 1.1754, + "step": 9102 + }, + { + "epoch": 7.270766773162939, + "grad_norm": 0.5510113835334778, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 9103 + }, + { + "epoch": 7.271565495207668, + "grad_norm": 0.4650730490684509, + "learning_rate": 0.0005, + "loss": 1.1741, + "step": 9104 + }, + { + "epoch": 7.272364217252396, + "grad_norm": 1.071080207824707, + "learning_rate": 0.0005, + "loss": 1.1418, + "step": 9105 + }, + { + "epoch": 7.273162939297125, + "grad_norm": 0.32088524103164673, + "learning_rate": 0.0005, + "loss": 1.1304, + "step": 9106 + }, + { + "epoch": 7.273961661341853, + "grad_norm": 1.2110369205474854, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 9107 + }, + { + "epoch": 7.274760383386582, + "grad_norm": 0.8781233429908752, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 9108 + }, + { + "epoch": 7.27555910543131, + "grad_norm": 0.356841117143631, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9109 + }, + { + "epoch": 7.276357827476039, + "grad_norm": 0.41136255860328674, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 9110 + }, + { + "epoch": 7.277156549520766, + "grad_norm": 0.30638960003852844, + "learning_rate": 0.0005, + "loss": 1.1006, + "step": 9111 + }, + { + "epoch": 7.277955271565495, + "grad_norm": 0.3056134879589081, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 9112 + }, + { + "epoch": 7.2787539936102235, + "grad_norm": 0.3053964376449585, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 9113 + }, + { + "epoch": 7.279552715654952, + "grad_norm": 0.2799919843673706, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 9114 + }, + { + "epoch": 7.2803514376996805, + "grad_norm": 0.19091907143592834, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 9115 + }, + { + "epoch": 7.281150159744409, + "grad_norm": 0.19973579049110413, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 9116 + }, + { + "epoch": 7.281948881789138, + "grad_norm": 0.21867726743221283, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 9117 + }, + { + "epoch": 7.282747603833866, + "grad_norm": 0.10351689904928207, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 9118 + }, + { + "epoch": 7.283546325878595, + "grad_norm": 0.16956113278865814, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 9119 + }, + { + "epoch": 7.284345047923322, + "grad_norm": 0.2959003150463104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 9120 + }, + { + "epoch": 7.285143769968051, + "grad_norm": 0.18194587528705597, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 9121 + }, + { + "epoch": 7.285942492012779, + "grad_norm": 0.10713140666484833, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 9122 + }, + { + "epoch": 7.286741214057508, + "grad_norm": 0.2391309142112732, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9123 + }, + { + "epoch": 7.287539936102236, + "grad_norm": 0.25640085339546204, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 9124 + }, + { + "epoch": 7.288338658146965, + "grad_norm": 0.25697845220565796, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9125 + }, + { + "epoch": 7.289137380191693, + "grad_norm": 0.2679392695426941, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 9126 + }, + { + "epoch": 7.289936102236422, + "grad_norm": 0.3405737280845642, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9127 + }, + { + "epoch": 7.2907348242811505, + "grad_norm": 0.31081417202949524, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 9128 + }, + { + "epoch": 7.291533546325878, + "grad_norm": 0.15159590542316437, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 9129 + }, + { + "epoch": 7.292332268370607, + "grad_norm": 1.1609382629394531, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 9130 + }, + { + "epoch": 7.293130990415335, + "grad_norm": 0.5588571429252625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 9131 + }, + { + "epoch": 7.293929712460064, + "grad_norm": 0.47076234221458435, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9132 + }, + { + "epoch": 7.294728434504792, + "grad_norm": 1.184756875038147, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 9133 + }, + { + "epoch": 7.295527156549521, + "grad_norm": 0.40956422686576843, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9134 + }, + { + "epoch": 7.296325878594249, + "grad_norm": 0.8017024397850037, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 9135 + }, + { + "epoch": 7.297124600638978, + "grad_norm": 0.29993146657943726, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9136 + }, + { + "epoch": 7.297923322683706, + "grad_norm": 0.4549245238304138, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 9137 + }, + { + "epoch": 7.298722044728435, + "grad_norm": 0.26366063952445984, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 9138 + }, + { + "epoch": 7.2995207667731625, + "grad_norm": 0.3126361668109894, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 9139 + }, + { + "epoch": 7.300319488817891, + "grad_norm": 0.18184784054756165, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 9140 + }, + { + "epoch": 7.30111821086262, + "grad_norm": 0.91683429479599, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 9141 + }, + { + "epoch": 7.301916932907348, + "grad_norm": 3.3384642601013184, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9142 + }, + { + "epoch": 7.302715654952077, + "grad_norm": 0.21734145283699036, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 9143 + }, + { + "epoch": 7.303514376996805, + "grad_norm": 0.13850291073322296, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9144 + }, + { + "epoch": 7.304313099041534, + "grad_norm": 0.1737629920244217, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 9145 + }, + { + "epoch": 7.305111821086262, + "grad_norm": 0.3947316110134125, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 9146 + }, + { + "epoch": 7.305910543130991, + "grad_norm": 0.16360799968242645, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9147 + }, + { + "epoch": 7.306709265175719, + "grad_norm": 0.14816711843013763, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9148 + }, + { + "epoch": 7.307507987220447, + "grad_norm": 0.13554179668426514, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9149 + }, + { + "epoch": 7.3083067092651754, + "grad_norm": 0.10308978706598282, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9150 + }, + { + "epoch": 7.309105431309904, + "grad_norm": 0.11216582357883453, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 9151 + }, + { + "epoch": 7.3099041533546325, + "grad_norm": 0.08531700819730759, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9152 + }, + { + "epoch": 7.310702875399361, + "grad_norm": 0.10261841118335724, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9153 + }, + { + "epoch": 7.31150159744409, + "grad_norm": 0.18318074941635132, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9154 + }, + { + "epoch": 7.312300319488818, + "grad_norm": 0.1616939902305603, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9155 + }, + { + "epoch": 7.313099041533547, + "grad_norm": 0.10412739217281342, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9156 + }, + { + "epoch": 7.313897763578275, + "grad_norm": 0.14097075164318085, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9157 + }, + { + "epoch": 7.314696485623003, + "grad_norm": 0.2168329358100891, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 9158 + }, + { + "epoch": 7.315495207667731, + "grad_norm": 0.14337286353111267, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 9159 + }, + { + "epoch": 7.31629392971246, + "grad_norm": 0.10328586399555206, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9160 + }, + { + "epoch": 7.317092651757188, + "grad_norm": 0.15820610523223877, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9161 + }, + { + "epoch": 7.317891373801917, + "grad_norm": 0.11771009862422943, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9162 + }, + { + "epoch": 7.318690095846645, + "grad_norm": 0.06801208108663559, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9163 + }, + { + "epoch": 7.319488817891374, + "grad_norm": 0.08691044896841049, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9164 + }, + { + "epoch": 7.3202875399361025, + "grad_norm": 0.10149878263473511, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9165 + }, + { + "epoch": 7.321086261980831, + "grad_norm": 0.08544973284006119, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9166 + }, + { + "epoch": 7.321884984025559, + "grad_norm": 0.21312831342220306, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9167 + }, + { + "epoch": 7.322683706070287, + "grad_norm": 0.09866507351398468, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9168 + }, + { + "epoch": 7.323482428115016, + "grad_norm": 0.09676753729581833, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9169 + }, + { + "epoch": 7.324281150159744, + "grad_norm": 0.1783452033996582, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9170 + }, + { + "epoch": 7.325079872204473, + "grad_norm": 0.16399280726909637, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9171 + }, + { + "epoch": 7.325878594249201, + "grad_norm": 0.1160425990819931, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9172 + }, + { + "epoch": 7.32667731629393, + "grad_norm": 0.09826952964067459, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9173 + }, + { + "epoch": 7.327476038338658, + "grad_norm": 0.1292516440153122, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9174 + }, + { + "epoch": 7.328274760383387, + "grad_norm": 0.1253383606672287, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9175 + }, + { + "epoch": 7.329073482428115, + "grad_norm": 0.15330855548381805, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9176 + }, + { + "epoch": 7.329872204472843, + "grad_norm": 0.16339725255966187, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9177 + }, + { + "epoch": 7.330670926517572, + "grad_norm": 0.1716328263282776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9178 + }, + { + "epoch": 7.3314696485623, + "grad_norm": 0.07669667154550552, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9179 + }, + { + "epoch": 7.332268370607029, + "grad_norm": 0.06626272946596146, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 9180 + }, + { + "epoch": 7.333067092651757, + "grad_norm": 0.0935940146446228, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9181 + }, + { + "epoch": 7.333865814696486, + "grad_norm": 0.07840511202812195, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9182 + }, + { + "epoch": 7.334664536741214, + "grad_norm": 0.07776588946580887, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9183 + }, + { + "epoch": 7.335463258785943, + "grad_norm": 0.084624283015728, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9184 + }, + { + "epoch": 7.336261980830671, + "grad_norm": 0.07562167197465897, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9185 + }, + { + "epoch": 7.3370607028754, + "grad_norm": 0.08628194034099579, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9186 + }, + { + "epoch": 7.337859424920127, + "grad_norm": 0.0654950812458992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9187 + }, + { + "epoch": 7.338658146964856, + "grad_norm": 0.06403883546590805, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9188 + }, + { + "epoch": 7.3394568690095845, + "grad_norm": 0.8679103851318359, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9189 + }, + { + "epoch": 7.340255591054313, + "grad_norm": 0.42257770895957947, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 9190 + }, + { + "epoch": 7.3410543130990416, + "grad_norm": 0.3017493486404419, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 9191 + }, + { + "epoch": 7.34185303514377, + "grad_norm": 0.30509164929389954, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 9192 + }, + { + "epoch": 7.342651757188499, + "grad_norm": 0.28457221388816833, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 9193 + }, + { + "epoch": 7.343450479233227, + "grad_norm": 0.2734214961528778, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9194 + }, + { + "epoch": 7.344249201277956, + "grad_norm": 0.2931375801563263, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 9195 + }, + { + "epoch": 7.345047923322683, + "grad_norm": 0.11534975469112396, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9196 + }, + { + "epoch": 7.345846645367412, + "grad_norm": 0.1489555388689041, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 9197 + }, + { + "epoch": 7.34664536741214, + "grad_norm": 0.13024470210075378, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 9198 + }, + { + "epoch": 7.347444089456869, + "grad_norm": 0.1413331776857376, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9199 + }, + { + "epoch": 7.348242811501597, + "grad_norm": 0.07862340658903122, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9200 + }, + { + "epoch": 7.349041533546326, + "grad_norm": 0.0870542973279953, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9201 + }, + { + "epoch": 7.3498402555910545, + "grad_norm": 0.07556174695491791, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 9202 + }, + { + "epoch": 7.350638977635783, + "grad_norm": 0.07381146401166916, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9203 + }, + { + "epoch": 7.3514376996805115, + "grad_norm": 0.5006929636001587, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 9204 + }, + { + "epoch": 7.352236421725239, + "grad_norm": 0.2980809807777405, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 9205 + }, + { + "epoch": 7.353035143769968, + "grad_norm": 0.20632435381412506, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 9206 + }, + { + "epoch": 7.353833865814696, + "grad_norm": 0.2028435915708542, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9207 + }, + { + "epoch": 7.354632587859425, + "grad_norm": 0.220264732837677, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9208 + }, + { + "epoch": 7.355431309904153, + "grad_norm": 0.07175029814243317, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9209 + }, + { + "epoch": 7.356230031948882, + "grad_norm": 0.20052626729011536, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9210 + }, + { + "epoch": 7.35702875399361, + "grad_norm": 0.3549690544605255, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9211 + }, + { + "epoch": 7.357827476038339, + "grad_norm": 0.1310572475194931, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9212 + }, + { + "epoch": 7.358626198083067, + "grad_norm": 0.9551740288734436, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 9213 + }, + { + "epoch": 7.359424920127796, + "grad_norm": 0.13663409650325775, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9214 + }, + { + "epoch": 7.360223642172524, + "grad_norm": 0.11436715722084045, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9215 + }, + { + "epoch": 7.361022364217252, + "grad_norm": 0.10911283642053604, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 9216 + }, + { + "epoch": 7.361821086261981, + "grad_norm": 0.11186671257019043, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9217 + }, + { + "epoch": 7.362619808306709, + "grad_norm": 0.1308698207139969, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9218 + }, + { + "epoch": 7.363418530351438, + "grad_norm": 0.07584013044834137, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9219 + }, + { + "epoch": 7.364217252396166, + "grad_norm": 0.07789483666419983, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 9220 + }, + { + "epoch": 7.365015974440895, + "grad_norm": 0.12758736312389374, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 9221 + }, + { + "epoch": 7.365814696485623, + "grad_norm": 0.09310994297266006, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9222 + }, + { + "epoch": 7.366613418530352, + "grad_norm": 0.14761847257614136, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9223 + }, + { + "epoch": 7.36741214057508, + "grad_norm": 0.8784921169281006, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9224 + }, + { + "epoch": 7.368210862619808, + "grad_norm": 0.07754036784172058, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9225 + }, + { + "epoch": 7.3690095846645365, + "grad_norm": 0.06706640869379044, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9226 + }, + { + "epoch": 7.369808306709265, + "grad_norm": 0.0949360579252243, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9227 + }, + { + "epoch": 7.3706070287539935, + "grad_norm": 0.09635552763938904, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9228 + }, + { + "epoch": 7.371405750798722, + "grad_norm": 0.15888135135173798, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9229 + }, + { + "epoch": 7.372204472843451, + "grad_norm": 0.1487814337015152, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9230 + }, + { + "epoch": 7.373003194888179, + "grad_norm": 0.09755469113588333, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9231 + }, + { + "epoch": 7.373801916932908, + "grad_norm": 0.2550356984138489, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9232 + }, + { + "epoch": 7.374600638977636, + "grad_norm": 0.13796621561050415, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9233 + }, + { + "epoch": 7.375399361022364, + "grad_norm": 0.06727192550897598, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9234 + }, + { + "epoch": 7.376198083067092, + "grad_norm": 0.09111928194761276, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 9235 + }, + { + "epoch": 7.376996805111821, + "grad_norm": 0.15708492696285248, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9236 + }, + { + "epoch": 7.377795527156549, + "grad_norm": 0.06607159227132797, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9237 + }, + { + "epoch": 7.378594249201278, + "grad_norm": 0.3495469391345978, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9238 + }, + { + "epoch": 7.3793929712460065, + "grad_norm": 0.249598890542984, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9239 + }, + { + "epoch": 7.380191693290735, + "grad_norm": 0.1506706029176712, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9240 + }, + { + "epoch": 7.3809904153354635, + "grad_norm": 0.2053573578596115, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9241 + }, + { + "epoch": 7.381789137380192, + "grad_norm": 0.20234468579292297, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9242 + }, + { + "epoch": 7.38258785942492, + "grad_norm": 0.23514828085899353, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9243 + }, + { + "epoch": 7.383386581469648, + "grad_norm": 0.13418453931808472, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9244 + }, + { + "epoch": 7.384185303514377, + "grad_norm": 0.07703951746225357, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9245 + }, + { + "epoch": 7.384984025559105, + "grad_norm": 0.20256030559539795, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 9246 + }, + { + "epoch": 7.385782747603834, + "grad_norm": 0.1140165850520134, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9247 + }, + { + "epoch": 7.386581469648562, + "grad_norm": 0.6283542513847351, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9248 + }, + { + "epoch": 7.387380191693291, + "grad_norm": 0.11779789626598358, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9249 + }, + { + "epoch": 7.388178913738019, + "grad_norm": 0.09821031987667084, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9250 + }, + { + "epoch": 7.388977635782748, + "grad_norm": 0.10942906141281128, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9251 + }, + { + "epoch": 7.389776357827476, + "grad_norm": 0.6150240302085876, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9252 + }, + { + "epoch": 7.390575079872204, + "grad_norm": 0.17758208513259888, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9253 + }, + { + "epoch": 7.391373801916933, + "grad_norm": 0.09567593038082123, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9254 + }, + { + "epoch": 7.392172523961661, + "grad_norm": 0.1177724078297615, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9255 + }, + { + "epoch": 7.39297124600639, + "grad_norm": 0.12369771301746368, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9256 + }, + { + "epoch": 7.393769968051118, + "grad_norm": 0.11247415840625763, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9257 + }, + { + "epoch": 7.394568690095847, + "grad_norm": 0.15094342827796936, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9258 + }, + { + "epoch": 7.395367412140575, + "grad_norm": 0.113029845058918, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9259 + }, + { + "epoch": 7.396166134185304, + "grad_norm": 0.1620573252439499, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9260 + }, + { + "epoch": 7.396964856230032, + "grad_norm": 0.10010898113250732, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9261 + }, + { + "epoch": 7.397763578274761, + "grad_norm": 0.21061348915100098, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9262 + }, + { + "epoch": 7.3985623003194885, + "grad_norm": 0.06199006363749504, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9263 + }, + { + "epoch": 7.399361022364217, + "grad_norm": 0.09612002968788147, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9264 + }, + { + "epoch": 7.4001597444089455, + "grad_norm": 0.13255780935287476, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9265 + }, + { + "epoch": 7.400958466453674, + "grad_norm": 0.22877056896686554, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9266 + }, + { + "epoch": 7.401757188498403, + "grad_norm": 0.18957512080669403, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9267 + }, + { + "epoch": 7.402555910543131, + "grad_norm": 0.211961030960083, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9268 + }, + { + "epoch": 7.40335463258786, + "grad_norm": 0.07744339853525162, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9269 + }, + { + "epoch": 7.404153354632588, + "grad_norm": 0.19085711240768433, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9270 + }, + { + "epoch": 7.404952076677317, + "grad_norm": 0.13099227845668793, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9271 + }, + { + "epoch": 7.405750798722044, + "grad_norm": 0.24543818831443787, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9272 + }, + { + "epoch": 7.406549520766773, + "grad_norm": 0.18623757362365723, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9273 + }, + { + "epoch": 7.407348242811501, + "grad_norm": 0.06898430734872818, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9274 + }, + { + "epoch": 7.40814696485623, + "grad_norm": 0.1809006780385971, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9275 + }, + { + "epoch": 7.4089456869009584, + "grad_norm": 0.11338596791028976, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9276 + }, + { + "epoch": 7.409744408945687, + "grad_norm": 0.10182031989097595, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9277 + }, + { + "epoch": 7.4105431309904155, + "grad_norm": 0.1521865278482437, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9278 + }, + { + "epoch": 7.411341853035144, + "grad_norm": 0.08848808705806732, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9279 + }, + { + "epoch": 7.412140575079873, + "grad_norm": 0.10398431867361069, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9280 + }, + { + "epoch": 7.4129392971246, + "grad_norm": 0.10145912319421768, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9281 + }, + { + "epoch": 7.413738019169329, + "grad_norm": 0.12386789917945862, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9282 + }, + { + "epoch": 7.414536741214057, + "grad_norm": 0.09763981401920319, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9283 + }, + { + "epoch": 7.415335463258786, + "grad_norm": 0.08810468763113022, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9284 + }, + { + "epoch": 7.416134185303514, + "grad_norm": 0.06196752190589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9285 + }, + { + "epoch": 7.416932907348243, + "grad_norm": 1.4297560453414917, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9286 + }, + { + "epoch": 7.417731629392971, + "grad_norm": 0.07783587276935577, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9287 + }, + { + "epoch": 7.4185303514377, + "grad_norm": 0.3592485189437866, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9288 + }, + { + "epoch": 7.419329073482428, + "grad_norm": 0.10796934366226196, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9289 + }, + { + "epoch": 7.420127795527157, + "grad_norm": 0.11450864374637604, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9290 + }, + { + "epoch": 7.420926517571885, + "grad_norm": 0.06718776375055313, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9291 + }, + { + "epoch": 7.421725239616613, + "grad_norm": 0.1776629537343979, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 9292 + }, + { + "epoch": 7.422523961661342, + "grad_norm": 0.058177318423986435, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9293 + }, + { + "epoch": 7.42332268370607, + "grad_norm": 0.08145572990179062, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9294 + }, + { + "epoch": 7.424121405750799, + "grad_norm": 0.07605774700641632, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9295 + }, + { + "epoch": 7.424920127795527, + "grad_norm": 0.5453565120697021, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9296 + }, + { + "epoch": 7.425718849840256, + "grad_norm": 0.08215200155973434, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9297 + }, + { + "epoch": 7.426517571884984, + "grad_norm": 0.06014016270637512, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9298 + }, + { + "epoch": 7.427316293929713, + "grad_norm": 0.11043576151132584, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9299 + }, + { + "epoch": 7.428115015974441, + "grad_norm": 0.1421220898628235, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9300 + }, + { + "epoch": 7.428913738019169, + "grad_norm": 0.10473544150590897, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9301 + }, + { + "epoch": 7.4297124600638975, + "grad_norm": 0.09921323508024216, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9302 + }, + { + "epoch": 7.430511182108626, + "grad_norm": 0.07775744050741196, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9303 + }, + { + "epoch": 7.431309904153355, + "grad_norm": 0.3015517294406891, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9304 + }, + { + "epoch": 7.432108626198083, + "grad_norm": 0.06826018542051315, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9305 + }, + { + "epoch": 7.432907348242812, + "grad_norm": 0.06002574786543846, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9306 + }, + { + "epoch": 7.43370607028754, + "grad_norm": 0.07082310318946838, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9307 + }, + { + "epoch": 7.434504792332269, + "grad_norm": 0.1356203258037567, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9308 + }, + { + "epoch": 7.435303514376997, + "grad_norm": 0.09689080715179443, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9309 + }, + { + "epoch": 7.436102236421725, + "grad_norm": 0.0938429981470108, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9310 + }, + { + "epoch": 7.436900958466453, + "grad_norm": 0.0853746086359024, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9311 + }, + { + "epoch": 7.437699680511182, + "grad_norm": 0.09427982568740845, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9312 + }, + { + "epoch": 7.43849840255591, + "grad_norm": 0.14042942225933075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9313 + }, + { + "epoch": 7.439297124600639, + "grad_norm": 0.4248291552066803, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9314 + }, + { + "epoch": 7.4400958466453675, + "grad_norm": 0.18214350938796997, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9315 + }, + { + "epoch": 7.440894568690096, + "grad_norm": 0.2564402222633362, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 9316 + }, + { + "epoch": 7.4416932907348246, + "grad_norm": 0.10012423992156982, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9317 + }, + { + "epoch": 7.442492012779553, + "grad_norm": 0.15337461233139038, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 9318 + }, + { + "epoch": 7.443290734824281, + "grad_norm": 0.1396649181842804, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9319 + }, + { + "epoch": 7.444089456869009, + "grad_norm": 0.12310001254081726, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9320 + }, + { + "epoch": 7.444888178913738, + "grad_norm": 0.12932278215885162, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9321 + }, + { + "epoch": 7.445686900958466, + "grad_norm": 0.12403959035873413, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9322 + }, + { + "epoch": 7.446485623003195, + "grad_norm": 0.4164578318595886, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9323 + }, + { + "epoch": 7.447284345047923, + "grad_norm": 0.2015235871076584, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 9324 + }, + { + "epoch": 7.448083067092652, + "grad_norm": 0.2619101107120514, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9325 + }, + { + "epoch": 7.44888178913738, + "grad_norm": 0.07511210441589355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9326 + }, + { + "epoch": 7.449680511182109, + "grad_norm": 7.956277370452881, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9327 + }, + { + "epoch": 7.4504792332268375, + "grad_norm": 0.23822273313999176, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 9328 + }, + { + "epoch": 7.451277955271565, + "grad_norm": 0.1565391719341278, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 9329 + }, + { + "epoch": 7.452076677316294, + "grad_norm": 0.15820777416229248, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 9330 + }, + { + "epoch": 7.452875399361022, + "grad_norm": 0.16341058909893036, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 9331 + }, + { + "epoch": 7.453674121405751, + "grad_norm": 0.19414658844470978, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 9332 + }, + { + "epoch": 7.454472843450479, + "grad_norm": 0.18798880279064178, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9333 + }, + { + "epoch": 7.455271565495208, + "grad_norm": 0.09032963961362839, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 9334 + }, + { + "epoch": 7.456070287539936, + "grad_norm": 0.12746790051460266, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9335 + }, + { + "epoch": 7.456869009584665, + "grad_norm": 0.34985360503196716, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9336 + }, + { + "epoch": 7.457667731629393, + "grad_norm": 0.22745627164840698, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 9337 + }, + { + "epoch": 7.458466453674122, + "grad_norm": 1.297531247138977, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9338 + }, + { + "epoch": 7.4592651757188495, + "grad_norm": 0.3254985809326172, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9339 + }, + { + "epoch": 7.460063897763578, + "grad_norm": 0.28899863362312317, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9340 + }, + { + "epoch": 7.460862619808307, + "grad_norm": 0.09964017570018768, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9341 + }, + { + "epoch": 7.461661341853035, + "grad_norm": 0.2713227868080139, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9342 + }, + { + "epoch": 7.462460063897764, + "grad_norm": 0.16604198515415192, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 9343 + }, + { + "epoch": 7.463258785942492, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 9344 + }, + { + "epoch": 7.464057507987221, + "grad_norm": 0.20081757009029388, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9345 + }, + { + "epoch": 7.464856230031949, + "grad_norm": 0.14005789160728455, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 9346 + }, + { + "epoch": 7.465654952076678, + "grad_norm": 0.15481705963611603, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9347 + }, + { + "epoch": 7.466453674121405, + "grad_norm": 0.1843721717596054, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9348 + }, + { + "epoch": 7.467252396166134, + "grad_norm": 0.11873828619718552, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 9349 + }, + { + "epoch": 7.468051118210862, + "grad_norm": 0.199008509516716, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9350 + }, + { + "epoch": 7.468849840255591, + "grad_norm": 0.10533998161554337, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9351 + }, + { + "epoch": 7.4696485623003195, + "grad_norm": 0.4823262691497803, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 9352 + }, + { + "epoch": 7.470447284345048, + "grad_norm": 0.25044289231300354, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9353 + }, + { + "epoch": 7.4712460063897765, + "grad_norm": 0.11273030936717987, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9354 + }, + { + "epoch": 7.472044728434505, + "grad_norm": 0.15552200376987457, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9355 + }, + { + "epoch": 7.472843450479234, + "grad_norm": 0.2211492508649826, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 9356 + }, + { + "epoch": 7.473642172523961, + "grad_norm": 0.38023853302001953, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9357 + }, + { + "epoch": 7.47444089456869, + "grad_norm": 0.15553027391433716, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9358 + }, + { + "epoch": 7.475239616613418, + "grad_norm": 0.11964324861764908, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9359 + }, + { + "epoch": 7.476038338658147, + "grad_norm": 0.06454652547836304, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9360 + }, + { + "epoch": 7.476837060702875, + "grad_norm": 0.090255506336689, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9361 + }, + { + "epoch": 7.477635782747604, + "grad_norm": 0.07100088149309158, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9362 + }, + { + "epoch": 7.478434504792332, + "grad_norm": 0.14697550237178802, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9363 + }, + { + "epoch": 7.479233226837061, + "grad_norm": 0.14088693261146545, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9364 + }, + { + "epoch": 7.4800319488817895, + "grad_norm": 0.12696029245853424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9365 + }, + { + "epoch": 7.480830670926517, + "grad_norm": 0.15335378050804138, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 9366 + }, + { + "epoch": 7.481629392971246, + "grad_norm": 0.10186830163002014, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9367 + }, + { + "epoch": 7.482428115015974, + "grad_norm": 0.11318683624267578, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 9368 + }, + { + "epoch": 7.483226837060703, + "grad_norm": 0.1290084272623062, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9369 + }, + { + "epoch": 7.484025559105431, + "grad_norm": 0.160775288939476, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 9370 + }, + { + "epoch": 7.48482428115016, + "grad_norm": 0.1998366117477417, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9371 + }, + { + "epoch": 7.485623003194888, + "grad_norm": 0.15808500349521637, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9372 + }, + { + "epoch": 7.486421725239617, + "grad_norm": 0.15403985977172852, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9373 + }, + { + "epoch": 7.487220447284345, + "grad_norm": 0.11963094770908356, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9374 + }, + { + "epoch": 7.488019169329074, + "grad_norm": 0.058245617896318436, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9375 + }, + { + "epoch": 7.488817891373802, + "grad_norm": 0.1256275773048401, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9376 + }, + { + "epoch": 7.48961661341853, + "grad_norm": 0.09230747818946838, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9377 + }, + { + "epoch": 7.4904153354632586, + "grad_norm": 0.15109197795391083, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9378 + }, + { + "epoch": 7.491214057507987, + "grad_norm": 0.20005039870738983, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9379 + }, + { + "epoch": 7.492012779552716, + "grad_norm": 0.08591387420892715, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9380 + }, + { + "epoch": 7.492811501597444, + "grad_norm": 0.07975071668624878, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9381 + }, + { + "epoch": 7.493610223642173, + "grad_norm": 0.1258707046508789, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9382 + }, + { + "epoch": 7.494408945686901, + "grad_norm": 0.16978499293327332, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9383 + }, + { + "epoch": 7.49520766773163, + "grad_norm": 0.09052985906600952, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9384 + }, + { + "epoch": 7.496006389776358, + "grad_norm": 0.15344351530075073, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9385 + }, + { + "epoch": 7.496805111821086, + "grad_norm": 0.04684900864958763, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9386 + }, + { + "epoch": 7.497603833865814, + "grad_norm": 0.09235356748104095, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9387 + }, + { + "epoch": 7.498402555910543, + "grad_norm": 0.0924983024597168, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9388 + }, + { + "epoch": 7.4992012779552715, + "grad_norm": 0.12623359262943268, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9389 + }, + { + "epoch": 7.5, + "grad_norm": 0.08572034537792206, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9390 + }, + { + "epoch": 7.5007987220447285, + "grad_norm": 0.12267094850540161, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9391 + }, + { + "epoch": 7.501597444089457, + "grad_norm": 0.20448675751686096, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9392 + }, + { + "epoch": 7.502396166134186, + "grad_norm": 0.21579930186271667, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9393 + }, + { + "epoch": 7.503194888178914, + "grad_norm": 0.22682903707027435, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9394 + }, + { + "epoch": 7.503993610223642, + "grad_norm": 0.08659582585096359, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 9395 + }, + { + "epoch": 7.50479233226837, + "grad_norm": 0.2064916491508484, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9396 + }, + { + "epoch": 7.505591054313099, + "grad_norm": 0.2137736678123474, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9397 + }, + { + "epoch": 7.506389776357827, + "grad_norm": 0.10891635715961456, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9398 + }, + { + "epoch": 7.507188498402556, + "grad_norm": 0.23018239438533783, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9399 + }, + { + "epoch": 7.507987220447284, + "grad_norm": 0.2091149538755417, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9400 + }, + { + "epoch": 7.508785942492013, + "grad_norm": 0.11136184632778168, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9401 + }, + { + "epoch": 7.5095846645367414, + "grad_norm": 0.1327456831932068, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9402 + }, + { + "epoch": 7.51038338658147, + "grad_norm": 0.08780363947153091, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9403 + }, + { + "epoch": 7.511182108626198, + "grad_norm": 0.14448396861553192, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9404 + }, + { + "epoch": 7.511980830670926, + "grad_norm": 0.12194132804870605, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9405 + }, + { + "epoch": 7.512779552715655, + "grad_norm": 0.09898994117975235, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9406 + }, + { + "epoch": 7.513578274760383, + "grad_norm": 0.0753403753042221, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9407 + }, + { + "epoch": 7.514376996805112, + "grad_norm": 0.1947120577096939, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9408 + }, + { + "epoch": 7.51517571884984, + "grad_norm": 0.10827653110027313, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9409 + }, + { + "epoch": 7.515974440894569, + "grad_norm": 0.06353825330734253, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9410 + }, + { + "epoch": 7.516773162939297, + "grad_norm": 0.16961680352687836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9411 + }, + { + "epoch": 7.517571884984026, + "grad_norm": 0.09001661092042923, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9412 + }, + { + "epoch": 7.518370607028754, + "grad_norm": 0.07342718541622162, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9413 + }, + { + "epoch": 7.519169329073483, + "grad_norm": 0.1001489907503128, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9414 + }, + { + "epoch": 7.5199680511182105, + "grad_norm": 0.10038813948631287, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9415 + }, + { + "epoch": 7.520766773162939, + "grad_norm": 0.17261064052581787, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9416 + }, + { + "epoch": 7.521565495207668, + "grad_norm": 0.10589580982923508, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9417 + }, + { + "epoch": 7.522364217252396, + "grad_norm": 0.055702172219753265, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9418 + }, + { + "epoch": 7.523162939297125, + "grad_norm": 0.122915118932724, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9419 + }, + { + "epoch": 7.523961661341853, + "grad_norm": 0.07361354678869247, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9420 + }, + { + "epoch": 7.524760383386582, + "grad_norm": 0.11187693476676941, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9421 + }, + { + "epoch": 7.52555910543131, + "grad_norm": 0.06205413118004799, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9422 + }, + { + "epoch": 7.526357827476039, + "grad_norm": 0.07805868983268738, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9423 + }, + { + "epoch": 7.527156549520766, + "grad_norm": 0.14349821209907532, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9424 + }, + { + "epoch": 7.527955271565495, + "grad_norm": 0.08928489685058594, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9425 + }, + { + "epoch": 7.5287539936102235, + "grad_norm": 0.10026145726442337, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 9426 + }, + { + "epoch": 7.529552715654952, + "grad_norm": 0.10531286150217056, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9427 + }, + { + "epoch": 7.5303514376996805, + "grad_norm": 0.15984703600406647, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9428 + }, + { + "epoch": 7.531150159744409, + "grad_norm": 0.2948785126209259, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9429 + }, + { + "epoch": 7.531948881789138, + "grad_norm": 0.08823632448911667, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9430 + }, + { + "epoch": 7.532747603833866, + "grad_norm": 0.23016497492790222, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9431 + }, + { + "epoch": 7.533546325878595, + "grad_norm": 0.08874809741973877, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9432 + }, + { + "epoch": 7.534345047923322, + "grad_norm": 0.09074181318283081, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9433 + }, + { + "epoch": 7.535143769968051, + "grad_norm": 0.15151673555374146, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9434 + }, + { + "epoch": 7.535942492012779, + "grad_norm": 0.12276771664619446, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9435 + }, + { + "epoch": 7.536741214057508, + "grad_norm": 0.13978977501392365, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9436 + }, + { + "epoch": 7.537539936102236, + "grad_norm": 0.16208869218826294, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 9437 + }, + { + "epoch": 7.538338658146965, + "grad_norm": 0.16932648420333862, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9438 + }, + { + "epoch": 7.539137380191693, + "grad_norm": 0.09139750897884369, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9439 + }, + { + "epoch": 7.539936102236422, + "grad_norm": 0.11264985054731369, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9440 + }, + { + "epoch": 7.5407348242811505, + "grad_norm": 0.13534623384475708, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9441 + }, + { + "epoch": 7.541533546325878, + "grad_norm": 0.16307172179222107, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9442 + }, + { + "epoch": 7.542332268370607, + "grad_norm": 0.09774577617645264, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9443 + }, + { + "epoch": 7.543130990415335, + "grad_norm": 0.1296136975288391, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9444 + }, + { + "epoch": 7.543929712460064, + "grad_norm": 0.08055619895458221, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9445 + }, + { + "epoch": 7.544728434504792, + "grad_norm": 0.2668273448944092, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9446 + }, + { + "epoch": 7.545527156549521, + "grad_norm": 0.1507730782032013, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9447 + }, + { + "epoch": 7.546325878594249, + "grad_norm": 0.17098994553089142, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9448 + }, + { + "epoch": 7.547124600638978, + "grad_norm": 0.22425173223018646, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9449 + }, + { + "epoch": 7.547923322683706, + "grad_norm": 0.3074493706226349, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9450 + }, + { + "epoch": 7.548722044728435, + "grad_norm": 0.1917268931865692, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9451 + }, + { + "epoch": 7.549520766773163, + "grad_norm": 0.21276478469371796, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9452 + }, + { + "epoch": 7.550319488817891, + "grad_norm": 0.2990981638431549, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9453 + }, + { + "epoch": 7.55111821086262, + "grad_norm": 0.21135985851287842, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9454 + }, + { + "epoch": 7.551916932907348, + "grad_norm": 0.1154661774635315, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9455 + }, + { + "epoch": 7.552715654952077, + "grad_norm": 0.13149744272232056, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9456 + }, + { + "epoch": 7.553514376996805, + "grad_norm": 0.36513134837150574, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9457 + }, + { + "epoch": 7.554313099041534, + "grad_norm": 0.2005227655172348, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9458 + }, + { + "epoch": 7.555111821086262, + "grad_norm": 0.22272491455078125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9459 + }, + { + "epoch": 7.555910543130991, + "grad_norm": 0.05990196391940117, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9460 + }, + { + "epoch": 7.556709265175719, + "grad_norm": 0.20874981582164764, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9461 + }, + { + "epoch": 7.557507987220447, + "grad_norm": 0.10478242486715317, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9462 + }, + { + "epoch": 7.5583067092651754, + "grad_norm": 0.2455470710992813, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9463 + }, + { + "epoch": 7.559105431309904, + "grad_norm": 0.31378838419914246, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9464 + }, + { + "epoch": 7.5599041533546325, + "grad_norm": 0.1903901994228363, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9465 + }, + { + "epoch": 7.560702875399361, + "grad_norm": 0.34334853291511536, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9466 + }, + { + "epoch": 7.56150159744409, + "grad_norm": 0.20050539076328278, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9467 + }, + { + "epoch": 7.562300319488818, + "grad_norm": 0.14147023856639862, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9468 + }, + { + "epoch": 7.563099041533547, + "grad_norm": 0.2242746353149414, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9469 + }, + { + "epoch": 7.563897763578275, + "grad_norm": 0.10040932893753052, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9470 + }, + { + "epoch": 7.564696485623003, + "grad_norm": 0.2527815103530884, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9471 + }, + { + "epoch": 7.565495207667731, + "grad_norm": 0.1675105094909668, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9472 + }, + { + "epoch": 7.56629392971246, + "grad_norm": 0.23818080127239227, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9473 + }, + { + "epoch": 7.567092651757188, + "grad_norm": 0.31956857442855835, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 9474 + }, + { + "epoch": 7.567891373801917, + "grad_norm": 0.15272031724452972, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9475 + }, + { + "epoch": 7.568690095846645, + "grad_norm": 0.20540206134319305, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9476 + }, + { + "epoch": 7.569488817891374, + "grad_norm": 0.2269754856824875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9477 + }, + { + "epoch": 7.5702875399361025, + "grad_norm": 0.19880101084709167, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9478 + }, + { + "epoch": 7.571086261980831, + "grad_norm": 0.2734098732471466, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9479 + }, + { + "epoch": 7.571884984025559, + "grad_norm": 0.17886638641357422, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9480 + }, + { + "epoch": 7.572683706070287, + "grad_norm": 0.15882767736911774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9481 + }, + { + "epoch": 7.573482428115016, + "grad_norm": 0.18066628277301788, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9482 + }, + { + "epoch": 7.574281150159744, + "grad_norm": 0.1025780662894249, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9483 + }, + { + "epoch": 7.575079872204473, + "grad_norm": 0.09417031705379486, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9484 + }, + { + "epoch": 7.575878594249201, + "grad_norm": 0.26811933517456055, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9485 + }, + { + "epoch": 7.57667731629393, + "grad_norm": 0.07128968089818954, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9486 + }, + { + "epoch": 7.577476038338658, + "grad_norm": 0.13026759028434753, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9487 + }, + { + "epoch": 7.578274760383387, + "grad_norm": 0.09879457950592041, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 9488 + }, + { + "epoch": 7.5790734824281145, + "grad_norm": 0.15383538603782654, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9489 + }, + { + "epoch": 7.579872204472844, + "grad_norm": 0.17010194063186646, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 9490 + }, + { + "epoch": 7.580670926517572, + "grad_norm": 0.09413834661245346, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9491 + }, + { + "epoch": 7.5814696485623, + "grad_norm": 0.13111010193824768, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9492 + }, + { + "epoch": 7.582268370607029, + "grad_norm": 0.14170758426189423, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9493 + }, + { + "epoch": 7.583067092651757, + "grad_norm": 0.10549119114875793, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9494 + }, + { + "epoch": 7.583865814696486, + "grad_norm": 0.06767291575670242, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9495 + }, + { + "epoch": 7.584664536741214, + "grad_norm": 0.3329547643661499, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9496 + }, + { + "epoch": 7.585463258785943, + "grad_norm": 0.09325312823057175, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9497 + }, + { + "epoch": 7.586261980830671, + "grad_norm": 0.11408714950084686, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 9498 + }, + { + "epoch": 7.5870607028754, + "grad_norm": 0.10127131640911102, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9499 + }, + { + "epoch": 7.587859424920127, + "grad_norm": 0.14656123518943787, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9500 + }, + { + "epoch": 7.588658146964856, + "grad_norm": 0.33641156554222107, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9501 + }, + { + "epoch": 7.5894568690095845, + "grad_norm": 0.09339869022369385, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9502 + }, + { + "epoch": 7.590255591054313, + "grad_norm": 0.10584868490695953, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9503 + }, + { + "epoch": 7.5910543130990416, + "grad_norm": 0.09518138319253922, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9504 + }, + { + "epoch": 7.59185303514377, + "grad_norm": 0.07680968940258026, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9505 + }, + { + "epoch": 7.592651757188499, + "grad_norm": 0.19037210941314697, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9506 + }, + { + "epoch": 7.593450479233227, + "grad_norm": 0.06012401729822159, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9507 + }, + { + "epoch": 7.594249201277956, + "grad_norm": 0.08509133756160736, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9508 + }, + { + "epoch": 7.595047923322683, + "grad_norm": 0.3013906478881836, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9509 + }, + { + "epoch": 7.595846645367412, + "grad_norm": 0.19873085618019104, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9510 + }, + { + "epoch": 7.59664536741214, + "grad_norm": 0.16749307513237, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9511 + }, + { + "epoch": 7.597444089456869, + "grad_norm": 0.18683338165283203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9512 + }, + { + "epoch": 7.598242811501597, + "grad_norm": 0.16748754680156708, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9513 + }, + { + "epoch": 7.599041533546326, + "grad_norm": 0.11243029683828354, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9514 + }, + { + "epoch": 7.5998402555910545, + "grad_norm": 0.08024061471223831, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9515 + }, + { + "epoch": 7.600638977635783, + "grad_norm": 0.20173156261444092, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9516 + }, + { + "epoch": 7.6014376996805115, + "grad_norm": 0.09208648651838303, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9517 + }, + { + "epoch": 7.602236421725239, + "grad_norm": 0.14459539949893951, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9518 + }, + { + "epoch": 7.603035143769968, + "grad_norm": 0.12492551654577255, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9519 + }, + { + "epoch": 7.603833865814696, + "grad_norm": 0.0866885557770729, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9520 + }, + { + "epoch": 7.604632587859425, + "grad_norm": 0.18499130010604858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 9521 + }, + { + "epoch": 7.605431309904153, + "grad_norm": 0.15807269513607025, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9522 + }, + { + "epoch": 7.606230031948882, + "grad_norm": 0.0766368880867958, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9523 + }, + { + "epoch": 7.60702875399361, + "grad_norm": 0.14770840108394623, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9524 + }, + { + "epoch": 7.607827476038339, + "grad_norm": 0.14121781289577484, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9525 + }, + { + "epoch": 7.608626198083067, + "grad_norm": 0.061000190675258636, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9526 + }, + { + "epoch": 7.609424920127795, + "grad_norm": 0.10725796967744827, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9527 + }, + { + "epoch": 7.6102236421725244, + "grad_norm": 0.4449822008609772, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9528 + }, + { + "epoch": 7.611022364217252, + "grad_norm": 0.4324694275856018, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9529 + }, + { + "epoch": 7.611821086261981, + "grad_norm": 0.2345885932445526, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9530 + }, + { + "epoch": 7.612619808306709, + "grad_norm": 0.38030850887298584, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9531 + }, + { + "epoch": 7.613418530351438, + "grad_norm": 0.28466367721557617, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9532 + }, + { + "epoch": 7.614217252396166, + "grad_norm": 0.2688463032245636, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9533 + }, + { + "epoch": 7.615015974440895, + "grad_norm": 0.3490566313266754, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9534 + }, + { + "epoch": 7.615814696485623, + "grad_norm": 0.17181244492530823, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9535 + }, + { + "epoch": 7.616613418530352, + "grad_norm": 0.2932468056678772, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9536 + }, + { + "epoch": 7.61741214057508, + "grad_norm": 0.07963605225086212, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9537 + }, + { + "epoch": 7.618210862619808, + "grad_norm": 0.3166755437850952, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9538 + }, + { + "epoch": 7.6190095846645365, + "grad_norm": 0.13043160736560822, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9539 + }, + { + "epoch": 7.619808306709265, + "grad_norm": 0.22799645364284515, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9540 + }, + { + "epoch": 7.6206070287539935, + "grad_norm": 0.13454940915107727, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9541 + }, + { + "epoch": 7.621405750798722, + "grad_norm": 0.25270769000053406, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9542 + }, + { + "epoch": 7.622204472843451, + "grad_norm": 0.07556870579719543, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9543 + }, + { + "epoch": 7.623003194888179, + "grad_norm": 0.4405477046966553, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9544 + }, + { + "epoch": 7.623801916932908, + "grad_norm": 0.13088728487491608, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9545 + }, + { + "epoch": 7.624600638977636, + "grad_norm": 0.22698643803596497, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9546 + }, + { + "epoch": 7.625399361022364, + "grad_norm": 0.16014008224010468, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9547 + }, + { + "epoch": 7.626198083067092, + "grad_norm": 0.12253693491220474, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9548 + }, + { + "epoch": 7.626996805111821, + "grad_norm": 0.19500789046287537, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9549 + }, + { + "epoch": 7.627795527156549, + "grad_norm": 0.049878381192684174, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9550 + }, + { + "epoch": 7.628594249201278, + "grad_norm": 0.15125001966953278, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9551 + }, + { + "epoch": 7.6293929712460065, + "grad_norm": 0.6651006937026978, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9552 + }, + { + "epoch": 7.630191693290735, + "grad_norm": 0.12418173998594284, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9553 + }, + { + "epoch": 7.6309904153354635, + "grad_norm": 0.0924144983291626, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9554 + }, + { + "epoch": 7.631789137380192, + "grad_norm": 0.16454379260540009, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9555 + }, + { + "epoch": 7.63258785942492, + "grad_norm": 0.23841862380504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9556 + }, + { + "epoch": 7.633386581469648, + "grad_norm": 0.11875062435865402, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9557 + }, + { + "epoch": 7.634185303514377, + "grad_norm": 0.16778984665870667, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9558 + }, + { + "epoch": 7.634984025559105, + "grad_norm": 0.12286275625228882, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9559 + }, + { + "epoch": 7.635782747603834, + "grad_norm": 0.11795859038829803, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9560 + }, + { + "epoch": 7.636581469648562, + "grad_norm": 0.10615531355142593, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9561 + }, + { + "epoch": 7.637380191693291, + "grad_norm": 0.1273939311504364, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9562 + }, + { + "epoch": 7.638178913738019, + "grad_norm": 0.0739448294043541, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9563 + }, + { + "epoch": 7.638977635782748, + "grad_norm": 0.3214738965034485, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9564 + }, + { + "epoch": 7.6397763578274756, + "grad_norm": 0.13925962150096893, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9565 + }, + { + "epoch": 7.640575079872205, + "grad_norm": 0.07356422394514084, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9566 + }, + { + "epoch": 7.641373801916933, + "grad_norm": 0.0708729475736618, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9567 + }, + { + "epoch": 7.642172523961661, + "grad_norm": 0.08209198713302612, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9568 + }, + { + "epoch": 7.64297124600639, + "grad_norm": 0.08787291496992111, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9569 + }, + { + "epoch": 7.643769968051118, + "grad_norm": 0.16093257069587708, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9570 + }, + { + "epoch": 7.644568690095847, + "grad_norm": 0.17313137650489807, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9571 + }, + { + "epoch": 7.645367412140575, + "grad_norm": 0.09015117585659027, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9572 + }, + { + "epoch": 7.646166134185304, + "grad_norm": 0.06029650941491127, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9573 + }, + { + "epoch": 7.646964856230032, + "grad_norm": 0.15379463136196136, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9574 + }, + { + "epoch": 7.647763578274761, + "grad_norm": 0.11657056212425232, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9575 + }, + { + "epoch": 7.6485623003194885, + "grad_norm": 0.04901152476668358, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9576 + }, + { + "epoch": 7.649361022364217, + "grad_norm": 0.1282874494791031, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9577 + }, + { + "epoch": 7.6501597444089455, + "grad_norm": 0.10117272287607193, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9578 + }, + { + "epoch": 7.650958466453674, + "grad_norm": 0.20984217524528503, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9579 + }, + { + "epoch": 7.651757188498403, + "grad_norm": 0.11340182274580002, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9580 + }, + { + "epoch": 7.652555910543131, + "grad_norm": 0.04095076769590378, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9581 + }, + { + "epoch": 7.65335463258786, + "grad_norm": 0.1021147221326828, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9582 + }, + { + "epoch": 7.654153354632588, + "grad_norm": 0.13590390980243683, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9583 + }, + { + "epoch": 7.654952076677317, + "grad_norm": 0.11193613708019257, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 9584 + }, + { + "epoch": 7.655750798722044, + "grad_norm": 0.12928733229637146, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9585 + }, + { + "epoch": 7.656549520766773, + "grad_norm": 0.2061600685119629, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9586 + }, + { + "epoch": 7.657348242811501, + "grad_norm": 0.24827928841114044, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9587 + }, + { + "epoch": 7.65814696485623, + "grad_norm": 0.11018779128789902, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9588 + }, + { + "epoch": 7.6589456869009584, + "grad_norm": 0.13417914509773254, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9589 + }, + { + "epoch": 7.659744408945687, + "grad_norm": 0.11718209832906723, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9590 + }, + { + "epoch": 7.6605431309904155, + "grad_norm": 0.11689312011003494, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9591 + }, + { + "epoch": 7.661341853035144, + "grad_norm": 0.14541427791118622, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9592 + }, + { + "epoch": 7.662140575079873, + "grad_norm": 0.325338751077652, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 9593 + }, + { + "epoch": 7.6629392971246, + "grad_norm": 0.13927216827869415, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9594 + }, + { + "epoch": 7.663738019169329, + "grad_norm": 0.06451129913330078, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9595 + }, + { + "epoch": 7.664536741214057, + "grad_norm": 0.12422754615545273, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9596 + }, + { + "epoch": 7.665335463258786, + "grad_norm": 0.10147815197706223, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9597 + }, + { + "epoch": 7.666134185303514, + "grad_norm": 0.168707475066185, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9598 + }, + { + "epoch": 7.666932907348243, + "grad_norm": 0.13256248831748962, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9599 + }, + { + "epoch": 7.667731629392971, + "grad_norm": 0.10466179251670837, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9600 + }, + { + "epoch": 7.6685303514377, + "grad_norm": 0.1508362740278244, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9601 + }, + { + "epoch": 7.669329073482428, + "grad_norm": 0.10080639272928238, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9602 + }, + { + "epoch": 7.670127795527156, + "grad_norm": 0.2546437382698059, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9603 + }, + { + "epoch": 7.6709265175718855, + "grad_norm": 0.119930200278759, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9604 + }, + { + "epoch": 7.671725239616613, + "grad_norm": 0.12494632601737976, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9605 + }, + { + "epoch": 7.672523961661342, + "grad_norm": 0.2126263678073883, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9606 + }, + { + "epoch": 7.67332268370607, + "grad_norm": 0.058003541082143784, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 9607 + }, + { + "epoch": 7.674121405750799, + "grad_norm": 0.16652998328208923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9608 + }, + { + "epoch": 7.674920127795527, + "grad_norm": 0.18094705045223236, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9609 + }, + { + "epoch": 7.675718849840256, + "grad_norm": 0.21123206615447998, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9610 + }, + { + "epoch": 7.676517571884984, + "grad_norm": 0.14626245200634003, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 9611 + }, + { + "epoch": 7.677316293929713, + "grad_norm": 0.16082580387592316, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9612 + }, + { + "epoch": 7.678115015974441, + "grad_norm": 0.2231828272342682, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9613 + }, + { + "epoch": 7.678913738019169, + "grad_norm": 0.0519767664372921, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9614 + }, + { + "epoch": 7.6797124600638975, + "grad_norm": 0.11062952876091003, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9615 + }, + { + "epoch": 7.680511182108626, + "grad_norm": 0.1141565814614296, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9616 + }, + { + "epoch": 7.681309904153355, + "grad_norm": 0.05675462633371353, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9617 + }, + { + "epoch": 7.682108626198083, + "grad_norm": 0.06369207054376602, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9618 + }, + { + "epoch": 7.682907348242812, + "grad_norm": 0.6857787370681763, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9619 + }, + { + "epoch": 7.68370607028754, + "grad_norm": 0.14775703847408295, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9620 + }, + { + "epoch": 7.684504792332269, + "grad_norm": 0.11832108348608017, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9621 + }, + { + "epoch": 7.685303514376997, + "grad_norm": 0.0789126604795456, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9622 + }, + { + "epoch": 7.686102236421725, + "grad_norm": 0.09771233052015305, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9623 + }, + { + "epoch": 7.686900958466453, + "grad_norm": 0.1002877801656723, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9624 + }, + { + "epoch": 7.687699680511182, + "grad_norm": 0.09265508502721786, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9625 + }, + { + "epoch": 7.68849840255591, + "grad_norm": 0.18757182359695435, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9626 + }, + { + "epoch": 7.689297124600639, + "grad_norm": 0.07585754990577698, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9627 + }, + { + "epoch": 7.6900958466453675, + "grad_norm": 0.08716554194688797, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9628 + }, + { + "epoch": 7.690894568690096, + "grad_norm": 0.12742596864700317, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9629 + }, + { + "epoch": 7.6916932907348246, + "grad_norm": 0.3201116621494293, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9630 + }, + { + "epoch": 7.692492012779553, + "grad_norm": 0.13922421634197235, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9631 + }, + { + "epoch": 7.693290734824281, + "grad_norm": 0.1482384353876114, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9632 + }, + { + "epoch": 7.694089456869009, + "grad_norm": 0.44062909483909607, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9633 + }, + { + "epoch": 7.694888178913738, + "grad_norm": 0.09945067763328552, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9634 + }, + { + "epoch": 7.695686900958466, + "grad_norm": 0.09670528769493103, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9635 + }, + { + "epoch": 7.696485623003195, + "grad_norm": 0.21770933270454407, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9636 + }, + { + "epoch": 7.697284345047923, + "grad_norm": 0.08205332607030869, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9637 + }, + { + "epoch": 7.698083067092652, + "grad_norm": 0.20427794754505157, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9638 + }, + { + "epoch": 7.69888178913738, + "grad_norm": 0.10897956788539886, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9639 + }, + { + "epoch": 7.699680511182109, + "grad_norm": 0.09363125264644623, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9640 + }, + { + "epoch": 7.700479233226837, + "grad_norm": 0.11652582138776779, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9641 + }, + { + "epoch": 7.701277955271565, + "grad_norm": 0.11023811250925064, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9642 + }, + { + "epoch": 7.702076677316294, + "grad_norm": 0.0836176723241806, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9643 + }, + { + "epoch": 7.702875399361022, + "grad_norm": 0.16838227212429047, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9644 + }, + { + "epoch": 7.703674121405751, + "grad_norm": 0.0736071765422821, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9645 + }, + { + "epoch": 7.704472843450479, + "grad_norm": 0.12039043009281158, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9646 + }, + { + "epoch": 7.705271565495208, + "grad_norm": 0.08551492542028427, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9647 + }, + { + "epoch": 7.706070287539936, + "grad_norm": 0.4940882921218872, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9648 + }, + { + "epoch": 7.706869009584665, + "grad_norm": 0.09832077473402023, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9649 + }, + { + "epoch": 7.707667731629393, + "grad_norm": 0.059512801468372345, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9650 + }, + { + "epoch": 7.708466453674122, + "grad_norm": 0.10426498204469681, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9651 + }, + { + "epoch": 7.7092651757188495, + "grad_norm": 0.07802911102771759, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9652 + }, + { + "epoch": 7.710063897763578, + "grad_norm": 0.07615980505943298, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9653 + }, + { + "epoch": 7.710862619808307, + "grad_norm": 0.0960751548409462, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9654 + }, + { + "epoch": 7.711661341853035, + "grad_norm": 0.2435871958732605, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9655 + }, + { + "epoch": 7.712460063897764, + "grad_norm": 0.05712791904807091, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9656 + }, + { + "epoch": 7.713258785942492, + "grad_norm": 0.08460236340761185, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9657 + }, + { + "epoch": 7.714057507987221, + "grad_norm": 0.07319195568561554, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9658 + }, + { + "epoch": 7.714856230031949, + "grad_norm": 0.09150193631649017, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9659 + }, + { + "epoch": 7.715654952076678, + "grad_norm": 0.1096913143992424, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9660 + }, + { + "epoch": 7.716453674121405, + "grad_norm": 0.0675668716430664, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9661 + }, + { + "epoch": 7.717252396166134, + "grad_norm": 0.0719466358423233, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9662 + }, + { + "epoch": 7.718051118210862, + "grad_norm": 0.0392761304974556, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9663 + }, + { + "epoch": 7.718849840255591, + "grad_norm": 0.0673295333981514, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9664 + }, + { + "epoch": 7.7196485623003195, + "grad_norm": 0.10867837816476822, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9665 + }, + { + "epoch": 7.720447284345048, + "grad_norm": 0.6895002126693726, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9666 + }, + { + "epoch": 7.7212460063897765, + "grad_norm": 0.09527186304330826, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9667 + }, + { + "epoch": 7.722044728434505, + "grad_norm": 0.11535433679819107, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9668 + }, + { + "epoch": 7.722843450479234, + "grad_norm": 0.08127015084028244, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9669 + }, + { + "epoch": 7.723642172523961, + "grad_norm": 0.06600163877010345, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9670 + }, + { + "epoch": 7.72444089456869, + "grad_norm": 0.1283862143754959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9671 + }, + { + "epoch": 7.725239616613418, + "grad_norm": 0.04981343448162079, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9672 + }, + { + "epoch": 7.726038338658147, + "grad_norm": 0.08641577512025833, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9673 + }, + { + "epoch": 7.726837060702875, + "grad_norm": 0.0503465011715889, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9674 + }, + { + "epoch": 7.727635782747604, + "grad_norm": 0.08342859894037247, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9675 + }, + { + "epoch": 7.728434504792332, + "grad_norm": 0.11919692903757095, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 9676 + }, + { + "epoch": 7.729233226837061, + "grad_norm": 0.2119137942790985, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9677 + }, + { + "epoch": 7.7300319488817895, + "grad_norm": 0.04871589317917824, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9678 + }, + { + "epoch": 7.730830670926517, + "grad_norm": 0.09571115672588348, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 9679 + }, + { + "epoch": 7.731629392971246, + "grad_norm": 0.07192373275756836, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9680 + }, + { + "epoch": 7.732428115015974, + "grad_norm": 1.5483330488204956, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9681 + }, + { + "epoch": 7.733226837060703, + "grad_norm": 0.11159799993038177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9682 + }, + { + "epoch": 7.734025559105431, + "grad_norm": 0.1700333058834076, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9683 + }, + { + "epoch": 7.73482428115016, + "grad_norm": 0.06227154657244682, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9684 + }, + { + "epoch": 7.735623003194888, + "grad_norm": 0.46623915433883667, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9685 + }, + { + "epoch": 7.736421725239617, + "grad_norm": 0.235361710190773, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9686 + }, + { + "epoch": 7.737220447284345, + "grad_norm": 0.11328862607479095, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9687 + }, + { + "epoch": 7.738019169329074, + "grad_norm": 0.05050930753350258, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9688 + }, + { + "epoch": 7.738817891373802, + "grad_norm": 0.12438485771417618, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9689 + }, + { + "epoch": 7.73961661341853, + "grad_norm": 0.08985885977745056, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9690 + }, + { + "epoch": 7.7404153354632586, + "grad_norm": 0.09222928434610367, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9691 + }, + { + "epoch": 7.741214057507987, + "grad_norm": 0.0879027396440506, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9692 + }, + { + "epoch": 7.742012779552716, + "grad_norm": 0.18554086983203888, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9693 + }, + { + "epoch": 7.742811501597444, + "grad_norm": 0.197623610496521, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9694 + }, + { + "epoch": 7.743610223642173, + "grad_norm": 0.07009958475828171, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9695 + }, + { + "epoch": 7.744408945686901, + "grad_norm": 0.059514936059713364, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9696 + }, + { + "epoch": 7.74520766773163, + "grad_norm": 0.6503719687461853, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9697 + }, + { + "epoch": 7.746006389776358, + "grad_norm": 0.4739440977573395, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9698 + }, + { + "epoch": 7.746805111821086, + "grad_norm": 0.15581674873828888, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9699 + }, + { + "epoch": 7.747603833865814, + "grad_norm": 0.3622123897075653, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9700 + }, + { + "epoch": 7.748402555910543, + "grad_norm": 0.16665314137935638, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 9701 + }, + { + "epoch": 7.7492012779552715, + "grad_norm": 0.2903657853603363, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9702 + }, + { + "epoch": 7.75, + "grad_norm": 0.1619565337896347, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9703 + }, + { + "epoch": 7.7507987220447285, + "grad_norm": 0.816677987575531, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9704 + }, + { + "epoch": 7.751597444089457, + "grad_norm": 0.20582620799541473, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9705 + }, + { + "epoch": 7.752396166134186, + "grad_norm": 0.24247393012046814, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 9706 + }, + { + "epoch": 7.753194888178914, + "grad_norm": 0.2543368339538574, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9707 + }, + { + "epoch": 7.753993610223642, + "grad_norm": 0.13885188102722168, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9708 + }, + { + "epoch": 7.75479233226837, + "grad_norm": 0.22553203999996185, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9709 + }, + { + "epoch": 7.755591054313099, + "grad_norm": 0.4961666762828827, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9710 + }, + { + "epoch": 7.756389776357827, + "grad_norm": 0.15139277279376984, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9711 + }, + { + "epoch": 7.757188498402556, + "grad_norm": 0.1196078360080719, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9712 + }, + { + "epoch": 7.757987220447284, + "grad_norm": 0.21309585869312286, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9713 + }, + { + "epoch": 7.758785942492013, + "grad_norm": 0.07756591588258743, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 9714 + }, + { + "epoch": 7.7595846645367414, + "grad_norm": 0.1986755132675171, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9715 + }, + { + "epoch": 7.76038338658147, + "grad_norm": 0.08994139730930328, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9716 + }, + { + "epoch": 7.761182108626198, + "grad_norm": 0.19416365027427673, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9717 + }, + { + "epoch": 7.761980830670926, + "grad_norm": 0.08997556567192078, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9718 + }, + { + "epoch": 7.762779552715655, + "grad_norm": 0.16295979917049408, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9719 + }, + { + "epoch": 7.763578274760383, + "grad_norm": 0.15271921455860138, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9720 + }, + { + "epoch": 7.764376996805112, + "grad_norm": 0.15118274092674255, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 9721 + }, + { + "epoch": 7.76517571884984, + "grad_norm": 0.14820800721645355, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9722 + }, + { + "epoch": 7.765974440894569, + "grad_norm": 0.08788670599460602, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9723 + }, + { + "epoch": 7.766773162939297, + "grad_norm": 0.13634555041790009, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9724 + }, + { + "epoch": 7.767571884984026, + "grad_norm": 0.3266567885875702, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 9725 + }, + { + "epoch": 7.768370607028754, + "grad_norm": 0.14486448466777802, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9726 + }, + { + "epoch": 7.769169329073483, + "grad_norm": 0.1453651785850525, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9727 + }, + { + "epoch": 7.7699680511182105, + "grad_norm": 0.09860636293888092, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9728 + }, + { + "epoch": 7.770766773162939, + "grad_norm": 0.1391478180885315, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 9729 + }, + { + "epoch": 7.771565495207668, + "grad_norm": 0.09883351624011993, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9730 + }, + { + "epoch": 7.772364217252396, + "grad_norm": 0.08394116163253784, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9731 + }, + { + "epoch": 7.773162939297125, + "grad_norm": 0.09769196063280106, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9732 + }, + { + "epoch": 7.773961661341853, + "grad_norm": 0.13514496386051178, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9733 + }, + { + "epoch": 7.774760383386582, + "grad_norm": 0.042965635657310486, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9734 + }, + { + "epoch": 7.77555910543131, + "grad_norm": 0.1645607203245163, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9735 + }, + { + "epoch": 7.776357827476039, + "grad_norm": 0.07206106185913086, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9736 + }, + { + "epoch": 7.777156549520766, + "grad_norm": 0.12476811558008194, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9737 + }, + { + "epoch": 7.777955271565495, + "grad_norm": 0.13698029518127441, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9738 + }, + { + "epoch": 7.7787539936102235, + "grad_norm": 0.06305114924907684, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9739 + }, + { + "epoch": 7.779552715654952, + "grad_norm": 0.08472646027803421, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9740 + }, + { + "epoch": 7.7803514376996805, + "grad_norm": 0.11592312157154083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9741 + }, + { + "epoch": 7.781150159744409, + "grad_norm": 0.1425880789756775, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9742 + }, + { + "epoch": 7.781948881789138, + "grad_norm": 0.15640930831432343, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9743 + }, + { + "epoch": 7.782747603833866, + "grad_norm": 0.10394492000341415, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9744 + }, + { + "epoch": 7.783546325878595, + "grad_norm": 0.11625290662050247, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9745 + }, + { + "epoch": 7.784345047923322, + "grad_norm": 0.10535796731710434, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9746 + }, + { + "epoch": 7.785143769968051, + "grad_norm": 3.235619068145752, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9747 + }, + { + "epoch": 7.785942492012779, + "grad_norm": 0.15474911034107208, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9748 + }, + { + "epoch": 7.786741214057508, + "grad_norm": 0.15647299587726593, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9749 + }, + { + "epoch": 7.787539936102236, + "grad_norm": 0.09747028350830078, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9750 + }, + { + "epoch": 7.788338658146965, + "grad_norm": 0.14497286081314087, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 9751 + }, + { + "epoch": 7.789137380191693, + "grad_norm": 0.07434806227684021, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9752 + }, + { + "epoch": 7.789936102236422, + "grad_norm": 0.15246541798114777, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9753 + }, + { + "epoch": 7.7907348242811505, + "grad_norm": 0.14904089272022247, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9754 + }, + { + "epoch": 7.791533546325878, + "grad_norm": 0.6353806257247925, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9755 + }, + { + "epoch": 7.792332268370607, + "grad_norm": 0.10378783196210861, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9756 + }, + { + "epoch": 7.793130990415335, + "grad_norm": 0.0953051820397377, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9757 + }, + { + "epoch": 7.793929712460064, + "grad_norm": 0.2233702540397644, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 9758 + }, + { + "epoch": 7.794728434504792, + "grad_norm": 0.19136221706867218, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9759 + }, + { + "epoch": 7.795527156549521, + "grad_norm": 0.2648071348667145, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9760 + }, + { + "epoch": 7.796325878594249, + "grad_norm": 0.22395290434360504, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9761 + }, + { + "epoch": 7.797124600638978, + "grad_norm": 0.6649449467658997, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9762 + }, + { + "epoch": 7.797923322683706, + "grad_norm": 0.20980334281921387, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9763 + }, + { + "epoch": 7.798722044728435, + "grad_norm": 0.14185716211795807, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 9764 + }, + { + "epoch": 7.799520766773163, + "grad_norm": 0.22333350777626038, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9765 + }, + { + "epoch": 7.800319488817891, + "grad_norm": 0.17339012026786804, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9766 + }, + { + "epoch": 7.80111821086262, + "grad_norm": 0.19127756357192993, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 9767 + }, + { + "epoch": 7.801916932907348, + "grad_norm": 0.15001042187213898, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9768 + }, + { + "epoch": 7.802715654952077, + "grad_norm": 0.1899043172597885, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9769 + }, + { + "epoch": 7.803514376996805, + "grad_norm": 0.1115068718791008, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9770 + }, + { + "epoch": 7.804313099041534, + "grad_norm": 6.017269134521484, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9771 + }, + { + "epoch": 7.805111821086262, + "grad_norm": 22.341394424438477, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9772 + }, + { + "epoch": 7.805910543130991, + "grad_norm": 88.28079223632812, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 9773 + }, + { + "epoch": 7.806709265175719, + "grad_norm": 461.5246887207031, + "learning_rate": 0.0005, + "loss": 1.083, + "step": 9774 + }, + { + "epoch": 7.807507987220447, + "grad_norm": 364435968.0, + "learning_rate": 0.0005, + "loss": 1.1099, + "step": 9775 + }, + { + "epoch": 7.8083067092651754, + "grad_norm": 994890816.0, + "learning_rate": 0.0005, + "loss": 1.1722, + "step": 9776 + }, + { + "epoch": 7.809105431309904, + "grad_norm": 6027866112.0, + "learning_rate": 0.0005, + "loss": 1.2413, + "step": 9777 + }, + { + "epoch": 7.8099041533546325, + "grad_norm": Infinity, + "learning_rate": 0.0005, + "loss": 1.3254, + "step": 9778 + }, + { + "epoch": 7.810702875399361, + "grad_norm": Infinity, + "learning_rate": 0.0005, + "loss": 1.4019, + "step": 9779 + }, + { + "epoch": 7.81150159744409, + "grad_norm": Infinity, + "learning_rate": 0.0005, + "loss": 1.4696, + "step": 9780 + }, + { + "epoch": 7.812300319488818, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 1.5684, + "step": 9781 + }, + { + "epoch": 7.813099041533547, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9782 + }, + { + "epoch": 7.813897763578275, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9783 + }, + { + "epoch": 7.814696485623003, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9784 + }, + { + "epoch": 7.815495207667731, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9785 + }, + { + "epoch": 7.81629392971246, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9786 + }, + { + "epoch": 7.817092651757188, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9787 + }, + { + "epoch": 7.817891373801917, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9788 + }, + { + "epoch": 7.818690095846645, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9789 + }, + { + "epoch": 7.819488817891374, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9790 + }, + { + "epoch": 7.8202875399361025, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9791 + }, + { + "epoch": 7.821086261980831, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9792 + }, + { + "epoch": 7.821884984025559, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9793 + }, + { + "epoch": 7.822683706070287, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9794 + }, + { + "epoch": 7.823482428115016, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9795 + }, + { + "epoch": 7.824281150159744, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9796 + }, + { + "epoch": 7.825079872204473, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9797 + }, + { + "epoch": 7.825878594249201, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9798 + }, + { + "epoch": 7.82667731629393, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9799 + }, + { + "epoch": 7.827476038338658, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9800 + }, + { + "epoch": 7.828274760383387, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9801 + }, + { + "epoch": 7.8290734824281145, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9802 + }, + { + "epoch": 7.829872204472844, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9803 + }, + { + "epoch": 7.830670926517572, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9804 + }, + { + "epoch": 7.8314696485623, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9805 + }, + { + "epoch": 7.832268370607029, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9806 + }, + { + "epoch": 7.833067092651757, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9807 + }, + { + "epoch": 7.833865814696486, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9808 + }, + { + "epoch": 7.834664536741214, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9809 + }, + { + "epoch": 7.835463258785943, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9810 + }, + { + "epoch": 7.836261980830671, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9811 + }, + { + "epoch": 7.8370607028754, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9812 + }, + { + "epoch": 7.837859424920127, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9813 + }, + { + "epoch": 7.838658146964856, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9814 + }, + { + "epoch": 7.8394568690095845, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9815 + }, + { + "epoch": 7.840255591054313, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9816 + }, + { + "epoch": 7.8410543130990416, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9817 + }, + { + "epoch": 7.84185303514377, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9818 + }, + { + "epoch": 7.842651757188499, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9819 + }, + { + "epoch": 7.843450479233227, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9820 + }, + { + "epoch": 7.844249201277956, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9821 + }, + { + "epoch": 7.845047923322683, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9822 + }, + { + "epoch": 7.845846645367412, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9823 + }, + { + "epoch": 7.84664536741214, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9824 + }, + { + "epoch": 7.847444089456869, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9825 + }, + { + "epoch": 7.848242811501597, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9826 + }, + { + "epoch": 7.849041533546326, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9827 + }, + { + "epoch": 7.8498402555910545, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9828 + }, + { + "epoch": 7.850638977635783, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9829 + }, + { + "epoch": 7.8514376996805115, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9830 + }, + { + "epoch": 7.852236421725239, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9831 + }, + { + "epoch": 7.853035143769968, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9832 + }, + { + "epoch": 7.853833865814696, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9833 + }, + { + "epoch": 7.854632587859425, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9834 + }, + { + "epoch": 7.855431309904153, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9835 + }, + { + "epoch": 7.856230031948882, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9836 + }, + { + "epoch": 7.85702875399361, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9837 + }, + { + "epoch": 7.857827476038339, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9838 + }, + { + "epoch": 7.858626198083067, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9839 + }, + { + "epoch": 7.859424920127795, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9840 + }, + { + "epoch": 7.8602236421725244, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9841 + }, + { + "epoch": 7.861022364217252, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9842 + }, + { + "epoch": 7.861821086261981, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9843 + }, + { + "epoch": 7.862619808306709, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9844 + }, + { + "epoch": 7.863418530351438, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9845 + }, + { + "epoch": 7.864217252396166, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9846 + }, + { + "epoch": 7.865015974440895, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9847 + }, + { + "epoch": 7.865814696485623, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9848 + }, + { + "epoch": 7.866613418530352, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9849 + }, + { + "epoch": 7.86741214057508, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9850 + }, + { + "epoch": 7.868210862619808, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9851 + }, + { + "epoch": 7.8690095846645365, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9852 + }, + { + "epoch": 7.869808306709265, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9853 + }, + { + "epoch": 7.8706070287539935, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9854 + }, + { + "epoch": 7.871405750798722, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9855 + }, + { + "epoch": 7.872204472843451, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9856 + }, + { + "epoch": 7.873003194888179, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9857 + }, + { + "epoch": 7.873801916932908, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9858 + }, + { + "epoch": 7.874600638977636, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9859 + }, + { + "epoch": 7.875399361022364, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9860 + }, + { + "epoch": 7.876198083067092, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9861 + }, + { + "epoch": 7.876996805111821, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9862 + }, + { + "epoch": 7.877795527156549, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9863 + }, + { + "epoch": 7.878594249201278, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9864 + }, + { + "epoch": 7.8793929712460065, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9865 + }, + { + "epoch": 7.880191693290735, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9866 + }, + { + "epoch": 7.8809904153354635, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9867 + }, + { + "epoch": 7.881789137380192, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9868 + }, + { + "epoch": 7.88258785942492, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9869 + }, + { + "epoch": 7.883386581469648, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9870 + }, + { + "epoch": 7.884185303514377, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9871 + }, + { + "epoch": 7.884984025559105, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9872 + }, + { + "epoch": 7.885782747603834, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9873 + }, + { + "epoch": 7.886581469648562, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9874 + }, + { + "epoch": 7.887380191693291, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9875 + }, + { + "epoch": 7.888178913738019, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9876 + }, + { + "epoch": 7.888977635782748, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9877 + }, + { + "epoch": 7.8897763578274756, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9878 + }, + { + "epoch": 7.890575079872205, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9879 + }, + { + "epoch": 7.891373801916933, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9880 + }, + { + "epoch": 7.892172523961661, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9881 + }, + { + "epoch": 7.89297124600639, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9882 + }, + { + "epoch": 7.893769968051118, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9883 + }, + { + "epoch": 7.894568690095847, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9884 + }, + { + "epoch": 7.895367412140575, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9885 + }, + { + "epoch": 7.896166134185304, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9886 + }, + { + "epoch": 7.896964856230032, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9887 + }, + { + "epoch": 7.897763578274761, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9888 + }, + { + "epoch": 7.8985623003194885, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9889 + }, + { + "epoch": 7.899361022364217, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9890 + }, + { + "epoch": 7.9001597444089455, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9891 + }, + { + "epoch": 7.900958466453674, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9892 + }, + { + "epoch": 7.901757188498403, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9893 + }, + { + "epoch": 7.902555910543131, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9894 + }, + { + "epoch": 7.90335463258786, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9895 + }, + { + "epoch": 7.904153354632588, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9896 + }, + { + "epoch": 7.904952076677317, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9897 + }, + { + "epoch": 7.905750798722044, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9898 + }, + { + "epoch": 7.906549520766773, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9899 + }, + { + "epoch": 7.907348242811501, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9900 + }, + { + "epoch": 7.90814696485623, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9901 + }, + { + "epoch": 7.9089456869009584, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9902 + }, + { + "epoch": 7.909744408945687, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9903 + }, + { + "epoch": 7.9105431309904155, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9904 + }, + { + "epoch": 7.911341853035144, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9905 + }, + { + "epoch": 7.912140575079873, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9906 + }, + { + "epoch": 7.9129392971246, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9907 + }, + { + "epoch": 7.913738019169329, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9908 + }, + { + "epoch": 7.914536741214057, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9909 + }, + { + "epoch": 7.915335463258786, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9910 + }, + { + "epoch": 7.916134185303514, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9911 + }, + { + "epoch": 7.916932907348243, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9912 + }, + { + "epoch": 7.917731629392971, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9913 + }, + { + "epoch": 7.9185303514377, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9914 + }, + { + "epoch": 7.919329073482428, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9915 + }, + { + "epoch": 7.920127795527156, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9916 + }, + { + "epoch": 7.9209265175718855, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9917 + }, + { + "epoch": 7.921725239616613, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9918 + }, + { + "epoch": 7.922523961661342, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9919 + }, + { + "epoch": 7.92332268370607, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9920 + }, + { + "epoch": 7.924121405750799, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9921 + }, + { + "epoch": 7.924920127795527, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9922 + }, + { + "epoch": 7.925718849840256, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9923 + }, + { + "epoch": 7.926517571884984, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9924 + }, + { + "epoch": 7.927316293929713, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9925 + }, + { + "epoch": 7.928115015974441, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9926 + }, + { + "epoch": 7.928913738019169, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9927 + }, + { + "epoch": 7.9297124600638975, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9928 + }, + { + "epoch": 7.930511182108626, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9929 + }, + { + "epoch": 7.931309904153355, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9930 + }, + { + "epoch": 7.932108626198083, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9931 + }, + { + "epoch": 7.932907348242812, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9932 + }, + { + "epoch": 7.93370607028754, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9933 + }, + { + "epoch": 7.934504792332269, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9934 + }, + { + "epoch": 7.935303514376997, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9935 + }, + { + "epoch": 7.936102236421725, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9936 + }, + { + "epoch": 7.936900958466453, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9937 + }, + { + "epoch": 7.937699680511182, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9938 + }, + { + "epoch": 7.93849840255591, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9939 + }, + { + "epoch": 7.939297124600639, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9940 + }, + { + "epoch": 7.9400958466453675, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9941 + }, + { + "epoch": 7.940894568690096, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9942 + }, + { + "epoch": 7.9416932907348246, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9943 + }, + { + "epoch": 7.942492012779553, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9944 + }, + { + "epoch": 7.943290734824281, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9945 + }, + { + "epoch": 7.944089456869009, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9946 + }, + { + "epoch": 7.944888178913738, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9947 + }, + { + "epoch": 7.945686900958466, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9948 + }, + { + "epoch": 7.946485623003195, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9949 + }, + { + "epoch": 7.947284345047923, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9950 + }, + { + "epoch": 7.948083067092652, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9951 + }, + { + "epoch": 7.94888178913738, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9952 + }, + { + "epoch": 7.949680511182109, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9953 + }, + { + "epoch": 7.950479233226837, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9954 + }, + { + "epoch": 7.951277955271565, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9955 + }, + { + "epoch": 7.952076677316294, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9956 + }, + { + "epoch": 7.952875399361022, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9957 + }, + { + "epoch": 7.953674121405751, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9958 + }, + { + "epoch": 7.954472843450479, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9959 + }, + { + "epoch": 7.955271565495208, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9960 + }, + { + "epoch": 7.956070287539936, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9961 + }, + { + "epoch": 7.956869009584665, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9962 + }, + { + "epoch": 7.957667731629393, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9963 + }, + { + "epoch": 7.958466453674122, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9964 + }, + { + "epoch": 7.9592651757188495, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9965 + }, + { + "epoch": 7.960063897763578, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9966 + }, + { + "epoch": 7.960862619808307, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9967 + }, + { + "epoch": 7.961661341853035, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9968 + }, + { + "epoch": 7.962460063897764, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9969 + }, + { + "epoch": 7.963258785942492, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9970 + }, + { + "epoch": 7.964057507987221, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9971 + }, + { + "epoch": 7.964856230031949, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9972 + }, + { + "epoch": 7.965654952076678, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9973 + }, + { + "epoch": 7.966453674121405, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9974 + }, + { + "epoch": 7.967252396166134, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9975 + }, + { + "epoch": 7.968051118210862, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9976 + }, + { + "epoch": 7.968849840255591, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9977 + }, + { + "epoch": 7.9696485623003195, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9978 + }, + { + "epoch": 7.970447284345048, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9979 + }, + { + "epoch": 7.9712460063897765, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9980 + }, + { + "epoch": 7.972044728434505, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9981 + }, + { + "epoch": 7.972843450479234, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9982 + }, + { + "epoch": 7.973642172523961, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9983 + }, + { + "epoch": 7.97444089456869, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9984 + }, + { + "epoch": 7.975239616613418, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9985 + }, + { + "epoch": 7.976038338658147, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9986 + }, + { + "epoch": 7.976837060702875, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9987 + }, + { + "epoch": 7.977635782747604, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9988 + }, + { + "epoch": 7.978434504792332, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9989 + }, + { + "epoch": 7.979233226837061, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9990 + }, + { + "epoch": 7.9800319488817895, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9991 + }, + { + "epoch": 7.980830670926517, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9992 + }, + { + "epoch": 7.981629392971246, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9993 + }, + { + "epoch": 7.982428115015974, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9994 + }, + { + "epoch": 7.983226837060703, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9995 + }, + { + "epoch": 7.984025559105431, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9996 + }, + { + "epoch": 7.98482428115016, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9997 + }, + { + "epoch": 7.985623003194888, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9998 + }, + { + "epoch": 7.986421725239617, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 9999 + }, + { + "epoch": 7.987220447284345, + "grad_norm": NaN, + "learning_rate": 0.0005, + "loss": 0.0, + "step": 10000 + } + ], + "logging_steps": 1.0, + "max_steps": 751200, + "num_input_tokens_seen": 0, + "num_train_epochs": 600, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.548817116436103e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10000/training_args.bin b/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0162074424e3714af8119d3be2b6e69cbb5b9f2 --- /dev/null +++ b/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06816c37733f99d23f044cefd981b2f404a72ddf40fa59f794154596b842fa95 +size 6072 diff --git a/checkpoint-9250/config.json b/checkpoint-9250/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0ae1ba49f17c446b66e627e5e96aa2c97bb02d --- /dev/null +++ b/checkpoint-9250/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "checkpoints/vlm_dc-vae-f32c32-sana-1.1_layerwise-0_group-7/checkpoint-9250", + "ar_steps": 1, + "architectures": [ + "DiffVLMDiffusion" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "condition_layer": -1, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "image_token_id": 151655, + "img_cross_attention_dim": 2048, + "img_diffuser_depth": 6, + "img_ffn_dim_multiplier": null, + "img_hidden_size": 1536, + "img_multiple_of": 256, + "img_norm_eps": 1e-05, + "img_num_attention_heads": 12, + "img_num_kv_heads": 12, + "img_qk_norm": true, + "in_channels": 32, + "initializer_range": 0.02, + "inject_img_diffuser": false, + "input_size": 32, + "intermediate_size": 8960, + "layer_group_size": 7, + "layerwise_start_idx": 0, + "lora_alpha": 256, + "lora_bias": "none", + "lora_dropout": 0.05, + "lora_enable": false, + "lora_r": 128, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_vl", + "non_linearity": 1, + "norm_elementwise_affine": true, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "patch_size": 2, + "repa_coeff": 0.1, + "repa_layers": "2", + "repa_shared": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sample_size": 128, + "sampling_steps": 28, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.47.0", + "use_cache": true, + "use_repa": false, + "use_residual_attn": false, + "use_sliding_window": false, + "vae_path": "mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers", + "video_token_id": 151656, + "vision_config": { + "hidden_size": 1536, + "in_chans": 3, + "model_type": "qwen2_vl", + "spatial_patch_size": 14 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/checkpoint-9250/generation_config.json b/checkpoint-9250/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3d465da90cb0ab59d8ba7babdefb0e88fbfa6b --- /dev/null +++ b/checkpoint-9250/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": 151645, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-9250/model-00001-of-00002.safetensors b/checkpoint-9250/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac988cf9b196a10b03615bf70d252c72de67a98e --- /dev/null +++ b/checkpoint-9250/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d1cbeeefbc8b3d0c08c879cd937dd13a38dffc753d81d8733af02c7f8f260c +size 4998598816 diff --git a/checkpoint-9250/model-00002-of-00002.safetensors b/checkpoint-9250/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96e88469ad721e7c5fdb2254f72e79391110a1a3 --- /dev/null +++ b/checkpoint-9250/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39517e88777c7799bf502fcb9668738b511e64ad8fcb899c8472c030637fa531 +size 4990560652 diff --git a/checkpoint-9250/model.safetensors.index.json b/checkpoint-9250/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b85b852967adb370204fb2c3e3d18822b10ab5 --- /dev/null +++ b/checkpoint-9250/model.safetensors.index.json @@ -0,0 +1,1740 @@ +{ + "metadata": { + "total_size": 9988962252 + }, + "weight_map": { + "embed_tokens.weight": "model-00002-of-00002.safetensors", + "img2text.bias": "model-00001-of-00002.safetensors", + "img2text.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.weight": "model-00001-of-00002.safetensors", + "layers.0.gate": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.1.gate": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.10.gate": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.11.gate": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.12.gate": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.13.gate": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.14.gate": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.15.gate": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.16.gate": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.17.gate": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.17.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.17.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.18.gate": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.19.gate": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.2.gate": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.20.gate": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.21.gate": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.22.gate": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.23.gate": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.24.gate": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.25.gate": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.26.gate": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.27.gate": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.3.gate": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.4.gate": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.5.gate": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.6.gate": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.7.gate": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.8.gate": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.9.gate": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "norm.weight": "model-00002-of-00002.safetensors", + "patch_embedder.proj.bias": "model-00001-of-00002.safetensors", + "patch_embedder.proj.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.weight": "model-00001-of-00002.safetensors", + "vae.decoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-9250/optimizer.pt b/checkpoint-9250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6ba11e6bfdbe60e7373e9d602b4e6e9c609fc9d --- /dev/null +++ b/checkpoint-9250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14855296a297fc3cb819a8f1e26fc414f9a01a3d78619c1aaed7173dd7afc238 +size 15084326534 diff --git a/checkpoint-9250/rng_state_0.pth b/checkpoint-9250/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c39a2905fed26e2068186e6d352a99fdf953ea3a --- /dev/null +++ b/checkpoint-9250/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f15e5ecf2215416cccfb5acf8a5866062dda90abc7d426d4092d9a6ea5701def +size 15984 diff --git a/checkpoint-9250/rng_state_1.pth b/checkpoint-9250/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8c50925729dafff9749ed732b6fffc7b1f8fc6b --- /dev/null +++ b/checkpoint-9250/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5109a82f4afe291929d6f2f266869e9bc00730c2c499bb040e4ab93e1d5619b7 +size 15984 diff --git a/checkpoint-9250/rng_state_2.pth b/checkpoint-9250/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad665573e4f6d4f89c813e42ffdf2a951fb93519 --- /dev/null +++ b/checkpoint-9250/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c8fe603eebd1ec48f2340fdffa828296306012f9ae455b491957b35c0e0a5e +size 15984 diff --git a/checkpoint-9250/rng_state_3.pth b/checkpoint-9250/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..faf72be1e1781e89e0de46c44f9befbbedb877b4 --- /dev/null +++ b/checkpoint-9250/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c38efab8f4a1112161f5f95bbefe72f91e2090ed864dcb16b055f0492b4a164 +size 15984 diff --git a/checkpoint-9250/rng_state_4.pth b/checkpoint-9250/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..4a913c41eab26885e028ac5a7e95390387888887 --- /dev/null +++ b/checkpoint-9250/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d831da0f86a0ccb4250a89e2e4cfce06afc7e18e80371373af6370805271d56 +size 15984 diff --git a/checkpoint-9250/rng_state_5.pth b/checkpoint-9250/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..91fa6ccd9d0f809e79851b8340d1cdf354abd903 --- /dev/null +++ b/checkpoint-9250/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35c020fc505e1a396c96c70e74cebae3796c8f3779874b3643dbc2af655bd5ef +size 15984 diff --git a/checkpoint-9250/rng_state_6.pth b/checkpoint-9250/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..7cc78eb2849d10c90db6571dbe320d237d29b443 --- /dev/null +++ b/checkpoint-9250/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d90b2d8bff52e46d645fb76cb28243b3ad5922bb8e85430722888010afe993ac +size 15984 diff --git a/checkpoint-9250/rng_state_7.pth b/checkpoint-9250/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..121c3404c1a34f86362c722daa53f2fba958399e --- /dev/null +++ b/checkpoint-9250/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4444f1ac90b52bff550eeee9bbf3ee70f9daa59e5b6a608866b1b1c58aa2fe09 +size 15984 diff --git a/checkpoint-9250/scheduler.pt b/checkpoint-9250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..476758a37b5034f1e47808322c4b2f3ade79ab66 --- /dev/null +++ b/checkpoint-9250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88560f892e51e219f2e49d8136a9bbce7f33b04ca2ed8085600760b3201dbd9c +size 1064 diff --git a/checkpoint-9250/trainer_state.json b/checkpoint-9250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f627bb7712688f425a39988825cceff99f89b7a0 --- /dev/null +++ b/checkpoint-9250/trainer_state.json @@ -0,0 +1,64783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.388178913738019, + "eval_steps": 500, + "global_step": 9250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007987220447284345, + "grad_norm": 0.08758673816919327, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1 + }, + { + "epoch": 0.001597444089456869, + "grad_norm": 2.9034857749938965, + "learning_rate": 0.0005, + "loss": 1.5342, + "step": 2 + }, + { + "epoch": 0.0023961661341853034, + "grad_norm": 1.260856032371521, + "learning_rate": 0.0005, + "loss": 1.3074, + "step": 3 + }, + { + "epoch": 0.003194888178913738, + "grad_norm": 2.2480077743530273, + "learning_rate": 0.0005, + "loss": 1.3434, + "step": 4 + }, + { + "epoch": 0.003993610223642172, + "grad_norm": 0.6822420358657837, + "learning_rate": 0.0005, + "loss": 1.2075, + "step": 5 + }, + { + "epoch": 0.004792332268370607, + "grad_norm": 0.7826036214828491, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 6 + }, + { + "epoch": 0.005591054313099041, + "grad_norm": 0.690284788608551, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 7 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 0.49136775732040405, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 8 + }, + { + "epoch": 0.00718849840255591, + "grad_norm": 0.3124309182167053, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 9 + }, + { + "epoch": 0.007987220447284345, + "grad_norm": 0.3409576714038849, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 10 + }, + { + "epoch": 0.00878594249201278, + "grad_norm": 0.25508174300193787, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 11 + }, + { + "epoch": 0.009584664536741214, + "grad_norm": 0.19042040407657623, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 12 + }, + { + "epoch": 0.010383386581469648, + "grad_norm": 0.2090323120355606, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 13 + }, + { + "epoch": 0.011182108626198083, + "grad_norm": 0.2102068066596985, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 14 + }, + { + "epoch": 0.011980830670926517, + "grad_norm": 0.12789177894592285, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 15 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 0.10204717516899109, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 16 + }, + { + "epoch": 0.013578274760383386, + "grad_norm": 0.174830362200737, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 17 + }, + { + "epoch": 0.01437699680511182, + "grad_norm": 0.25637468695640564, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 18 + }, + { + "epoch": 0.015175718849840255, + "grad_norm": 0.28002411127090454, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 19 + }, + { + "epoch": 0.01597444089456869, + "grad_norm": 0.23047354817390442, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 20 + }, + { + "epoch": 0.016773162939297124, + "grad_norm": 0.1548614650964737, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 21 + }, + { + "epoch": 0.01757188498402556, + "grad_norm": 0.07078541815280914, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 22 + }, + { + "epoch": 0.018370607028753993, + "grad_norm": 0.10615550726652145, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 23 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 0.10240291804075241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 24 + }, + { + "epoch": 0.019968051118210862, + "grad_norm": 0.07588993012905121, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 25 + }, + { + "epoch": 0.020766773162939296, + "grad_norm": 0.06380276381969452, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 26 + }, + { + "epoch": 0.02156549520766773, + "grad_norm": 0.06891524791717529, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 27 + }, + { + "epoch": 0.022364217252396165, + "grad_norm": 0.0625377744436264, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 28 + }, + { + "epoch": 0.0231629392971246, + "grad_norm": 0.12064792215824127, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 29 + }, + { + "epoch": 0.023961661341853034, + "grad_norm": 0.29220151901245117, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 30 + }, + { + "epoch": 0.02476038338658147, + "grad_norm": 0.7822219729423523, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 31 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 1.5172864198684692, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 32 + }, + { + "epoch": 0.026357827476038338, + "grad_norm": 0.18434809148311615, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 33 + }, + { + "epoch": 0.027156549520766772, + "grad_norm": 0.535632848739624, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 34 + }, + { + "epoch": 0.027955271565495207, + "grad_norm": 0.21549028158187866, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 35 + }, + { + "epoch": 0.02875399361022364, + "grad_norm": 0.4726889431476593, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 36 + }, + { + "epoch": 0.029552715654952075, + "grad_norm": 0.2519988417625427, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 37 + }, + { + "epoch": 0.03035143769968051, + "grad_norm": 0.2973701059818268, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 38 + }, + { + "epoch": 0.031150159744408944, + "grad_norm": 0.30153587460517883, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 39 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 0.08746712654829025, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 40 + }, + { + "epoch": 0.03274760383386582, + "grad_norm": 0.3308769762516022, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 41 + }, + { + "epoch": 0.03354632587859425, + "grad_norm": 0.10948555171489716, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 42 + }, + { + "epoch": 0.034345047923322686, + "grad_norm": 0.3044797480106354, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 43 + }, + { + "epoch": 0.03514376996805112, + "grad_norm": 0.11677752435207367, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 44 + }, + { + "epoch": 0.035942492012779555, + "grad_norm": 0.30327609181404114, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 45 + }, + { + "epoch": 0.036741214057507986, + "grad_norm": 0.10603009909391403, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 46 + }, + { + "epoch": 0.037539936102236424, + "grad_norm": 0.2693077623844147, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 47 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.11918680369853973, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 48 + }, + { + "epoch": 0.03913738019169329, + "grad_norm": 0.2965734899044037, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 49 + }, + { + "epoch": 0.039936102236421724, + "grad_norm": 0.10428953915834427, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 50 + }, + { + "epoch": 0.04073482428115016, + "grad_norm": 0.23307208716869354, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 51 + }, + { + "epoch": 0.04153354632587859, + "grad_norm": 0.07401563227176666, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 52 + }, + { + "epoch": 0.04233226837060703, + "grad_norm": 0.22344312071800232, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 53 + }, + { + "epoch": 0.04313099041533546, + "grad_norm": 0.1782081127166748, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 54 + }, + { + "epoch": 0.0439297124600639, + "grad_norm": 0.10123606026172638, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 55 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 0.2618716359138489, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 56 + }, + { + "epoch": 0.04552715654952077, + "grad_norm": 0.15046533942222595, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 57 + }, + { + "epoch": 0.0463258785942492, + "grad_norm": 0.1341097205877304, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 58 + }, + { + "epoch": 0.04712460063897764, + "grad_norm": 0.20391245186328888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 59 + }, + { + "epoch": 0.04792332268370607, + "grad_norm": 0.09610722959041595, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 60 + }, + { + "epoch": 0.048722044728434506, + "grad_norm": 0.09877557307481766, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 61 + }, + { + "epoch": 0.04952076677316294, + "grad_norm": 0.16971156001091003, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 62 + }, + { + "epoch": 0.050319488817891375, + "grad_norm": 0.1819174885749817, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 63 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.13067278265953064, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 64 + }, + { + "epoch": 0.051916932907348244, + "grad_norm": 0.10557633638381958, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 65 + }, + { + "epoch": 0.052715654952076675, + "grad_norm": 0.08713806420564651, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 66 + }, + { + "epoch": 0.05351437699680511, + "grad_norm": 0.12453104555606842, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 67 + }, + { + "epoch": 0.054313099041533544, + "grad_norm": 0.19147996604442596, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 68 + }, + { + "epoch": 0.05511182108626198, + "grad_norm": 0.21808673441410065, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 69 + }, + { + "epoch": 0.05591054313099041, + "grad_norm": 0.15922780334949493, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 70 + }, + { + "epoch": 0.05670926517571885, + "grad_norm": 0.09400095790624619, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 71 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 0.071605384349823, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 72 + }, + { + "epoch": 0.05830670926517572, + "grad_norm": 0.08754080533981323, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 73 + }, + { + "epoch": 0.05910543130990415, + "grad_norm": 0.07777409255504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 74 + }, + { + "epoch": 0.05990415335463259, + "grad_norm": 0.04577887803316116, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 75 + }, + { + "epoch": 0.06070287539936102, + "grad_norm": 0.07278449088335037, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 76 + }, + { + "epoch": 0.06150159744408946, + "grad_norm": 0.06739042699337006, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 77 + }, + { + "epoch": 0.06230031948881789, + "grad_norm": 0.06367938220500946, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 78 + }, + { + "epoch": 0.06309904153354633, + "grad_norm": 0.0551401786506176, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 79 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.04846199229359627, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 80 + }, + { + "epoch": 0.06469648562300319, + "grad_norm": 0.089615598320961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 81 + }, + { + "epoch": 0.06549520766773163, + "grad_norm": 0.19073566794395447, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 82 + }, + { + "epoch": 0.06629392971246006, + "grad_norm": 0.26971691846847534, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 83 + }, + { + "epoch": 0.0670926517571885, + "grad_norm": 0.3124604821205139, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 84 + }, + { + "epoch": 0.06789137380191693, + "grad_norm": 0.3448403775691986, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 85 + }, + { + "epoch": 0.06869009584664537, + "grad_norm": 0.2708166837692261, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 86 + }, + { + "epoch": 0.0694888178913738, + "grad_norm": 0.10507494956254959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 87 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 0.1015392392873764, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 88 + }, + { + "epoch": 0.07108626198083066, + "grad_norm": 0.34002622961997986, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 89 + }, + { + "epoch": 0.07188498402555911, + "grad_norm": 0.5238372683525085, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 90 + }, + { + "epoch": 0.07268370607028754, + "grad_norm": 0.5267866253852844, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 91 + }, + { + "epoch": 0.07348242811501597, + "grad_norm": 0.3286864757537842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 92 + }, + { + "epoch": 0.0742811501597444, + "grad_norm": 0.14270304143428802, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 93 + }, + { + "epoch": 0.07507987220447285, + "grad_norm": 0.3481365740299225, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 94 + }, + { + "epoch": 0.07587859424920128, + "grad_norm": 0.33883902430534363, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 95 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.2553725838661194, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 96 + }, + { + "epoch": 0.07747603833865814, + "grad_norm": 0.21944141387939453, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 97 + }, + { + "epoch": 0.07827476038338659, + "grad_norm": 0.18821558356285095, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 98 + }, + { + "epoch": 0.07907348242811502, + "grad_norm": 0.20073482394218445, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 99 + }, + { + "epoch": 0.07987220447284345, + "grad_norm": 0.2643139958381653, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 100 + }, + { + "epoch": 0.08067092651757188, + "grad_norm": 0.1843930184841156, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 101 + }, + { + "epoch": 0.08146964856230032, + "grad_norm": 0.12745684385299683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 102 + }, + { + "epoch": 0.08226837060702875, + "grad_norm": 0.3252592384815216, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 103 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 0.33775797486305237, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 104 + }, + { + "epoch": 0.08386581469648563, + "grad_norm": 0.24846483767032623, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 105 + }, + { + "epoch": 0.08466453674121406, + "grad_norm": 0.1598653495311737, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 106 + }, + { + "epoch": 0.08546325878594249, + "grad_norm": 0.2555698752403259, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 107 + }, + { + "epoch": 0.08626198083067092, + "grad_norm": 0.3770487308502197, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 108 + }, + { + "epoch": 0.08706070287539937, + "grad_norm": 0.3179391026496887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 109 + }, + { + "epoch": 0.0878594249201278, + "grad_norm": 0.11638858914375305, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 110 + }, + { + "epoch": 0.08865814696485623, + "grad_norm": 0.20365215837955475, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 111 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.22354111075401306, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 112 + }, + { + "epoch": 0.0902555910543131, + "grad_norm": 0.1944236010313034, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 113 + }, + { + "epoch": 0.09105431309904154, + "grad_norm": 0.16177603602409363, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 114 + }, + { + "epoch": 0.09185303514376997, + "grad_norm": 0.06650812178850174, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 115 + }, + { + "epoch": 0.0926517571884984, + "grad_norm": 0.20236945152282715, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 116 + }, + { + "epoch": 0.09345047923322684, + "grad_norm": 0.19086670875549316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 117 + }, + { + "epoch": 0.09424920127795527, + "grad_norm": 0.17380473017692566, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 118 + }, + { + "epoch": 0.0950479233226837, + "grad_norm": 0.11360115557909012, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 119 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 0.09359298646450043, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 120 + }, + { + "epoch": 0.09664536741214058, + "grad_norm": 0.15317411720752716, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 121 + }, + { + "epoch": 0.09744408945686901, + "grad_norm": 0.05564137175679207, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 122 + }, + { + "epoch": 0.09824281150159744, + "grad_norm": 0.13476046919822693, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 123 + }, + { + "epoch": 0.09904153354632587, + "grad_norm": 0.11372318118810654, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 124 + }, + { + "epoch": 0.09984025559105432, + "grad_norm": 0.11330179125070572, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 125 + }, + { + "epoch": 0.10063897763578275, + "grad_norm": 0.11304716765880585, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 126 + }, + { + "epoch": 0.10143769968051118, + "grad_norm": 0.06369871646165848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 127 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.14034464955329895, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 128 + }, + { + "epoch": 0.10303514376996806, + "grad_norm": 0.1080808937549591, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 129 + }, + { + "epoch": 0.10383386581469649, + "grad_norm": 0.09568007290363312, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 130 + }, + { + "epoch": 0.10463258785942492, + "grad_norm": 0.1359473019838333, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 131 + }, + { + "epoch": 0.10543130990415335, + "grad_norm": 0.06500346213579178, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 132 + }, + { + "epoch": 0.1062300319488818, + "grad_norm": 0.11564832180738449, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 133 + }, + { + "epoch": 0.10702875399361023, + "grad_norm": 0.2115149199962616, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 134 + }, + { + "epoch": 0.10782747603833866, + "grad_norm": 0.3098243772983551, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 135 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 0.446521133184433, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 136 + }, + { + "epoch": 0.10942492012779553, + "grad_norm": 0.5194831490516663, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 137 + }, + { + "epoch": 0.11022364217252396, + "grad_norm": 0.447731077671051, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 138 + }, + { + "epoch": 0.1110223642172524, + "grad_norm": 0.2195945680141449, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 139 + }, + { + "epoch": 0.11182108626198083, + "grad_norm": 0.1277567446231842, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 140 + }, + { + "epoch": 0.11261980830670927, + "grad_norm": 0.3284558355808258, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 141 + }, + { + "epoch": 0.1134185303514377, + "grad_norm": 0.40208715200424194, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 142 + }, + { + "epoch": 0.11421725239616613, + "grad_norm": 0.28310486674308777, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 143 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.0786294937133789, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 144 + }, + { + "epoch": 0.11581469648562301, + "grad_norm": 0.18283484876155853, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 145 + }, + { + "epoch": 0.11661341853035144, + "grad_norm": 0.20186439156532288, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 146 + }, + { + "epoch": 0.11741214057507987, + "grad_norm": 0.15860706567764282, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 147 + }, + { + "epoch": 0.1182108626198083, + "grad_norm": 0.1436982899904251, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 148 + }, + { + "epoch": 0.11900958466453675, + "grad_norm": 0.15206722915172577, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 149 + }, + { + "epoch": 0.11980830670926518, + "grad_norm": 0.252279132604599, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 150 + }, + { + "epoch": 0.12060702875399361, + "grad_norm": 0.19411228597164154, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 151 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 0.07377714663743973, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 152 + }, + { + "epoch": 0.12220447284345048, + "grad_norm": 0.15493856370449066, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 153 + }, + { + "epoch": 0.12300319488817892, + "grad_norm": 0.275601863861084, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 154 + }, + { + "epoch": 0.12380191693290735, + "grad_norm": 0.42461103200912476, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 155 + }, + { + "epoch": 0.12460063897763578, + "grad_norm": 0.41153159737586975, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 156 + }, + { + "epoch": 0.1253993610223642, + "grad_norm": 0.2487967610359192, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 157 + }, + { + "epoch": 0.12619808306709265, + "grad_norm": 0.10687623918056488, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 158 + }, + { + "epoch": 0.1269968051118211, + "grad_norm": 0.28695282340049744, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 159 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.38554099202156067, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 160 + }, + { + "epoch": 0.12859424920127796, + "grad_norm": 0.25622498989105225, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 161 + }, + { + "epoch": 0.12939297124600638, + "grad_norm": 0.10341542959213257, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 162 + }, + { + "epoch": 0.13019169329073482, + "grad_norm": 0.20450755953788757, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 163 + }, + { + "epoch": 0.13099041533546327, + "grad_norm": 0.2664271295070648, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 164 + }, + { + "epoch": 0.13178913738019168, + "grad_norm": 0.23936089873313904, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 165 + }, + { + "epoch": 0.13258785942492013, + "grad_norm": 0.0662769302725792, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 166 + }, + { + "epoch": 0.13338658146964857, + "grad_norm": 0.13597780466079712, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 167 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 0.15996500849723816, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 168 + }, + { + "epoch": 0.13498402555910544, + "grad_norm": 0.10095447301864624, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 169 + }, + { + "epoch": 0.13578274760383385, + "grad_norm": 0.09733449667692184, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 170 + }, + { + "epoch": 0.1365814696485623, + "grad_norm": 0.16480964422225952, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 171 + }, + { + "epoch": 0.13738019169329074, + "grad_norm": 0.21611596643924713, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 172 + }, + { + "epoch": 0.13817891373801916, + "grad_norm": 0.21607941389083862, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 173 + }, + { + "epoch": 0.1389776357827476, + "grad_norm": 0.2234959453344345, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 174 + }, + { + "epoch": 0.13977635782747605, + "grad_norm": 0.10778137296438217, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 175 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.1758418083190918, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 176 + }, + { + "epoch": 0.1413738019169329, + "grad_norm": 0.30717936158180237, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 177 + }, + { + "epoch": 0.14217252396166133, + "grad_norm": 0.3382156789302826, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 178 + }, + { + "epoch": 0.14297124600638977, + "grad_norm": 0.23189185559749603, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 179 + }, + { + "epoch": 0.14376996805111822, + "grad_norm": 0.04988733306527138, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 180 + }, + { + "epoch": 0.14456869009584664, + "grad_norm": 0.15606579184532166, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 181 + }, + { + "epoch": 0.14536741214057508, + "grad_norm": 0.2366417795419693, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 182 + }, + { + "epoch": 0.14616613418530353, + "grad_norm": 0.21878089010715485, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 183 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 0.09316077083349228, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 184 + }, + { + "epoch": 0.1477635782747604, + "grad_norm": 0.119263656437397, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 185 + }, + { + "epoch": 0.1485623003194888, + "grad_norm": 0.26743847131729126, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 186 + }, + { + "epoch": 0.14936102236421725, + "grad_norm": 0.34438276290893555, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 187 + }, + { + "epoch": 0.1501597444089457, + "grad_norm": 0.30809128284454346, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 188 + }, + { + "epoch": 0.1509584664536741, + "grad_norm": 0.1406010240316391, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 189 + }, + { + "epoch": 0.15175718849840256, + "grad_norm": 0.09509757161140442, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 190 + }, + { + "epoch": 0.152555910543131, + "grad_norm": 0.24529854953289032, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 191 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.2803219258785248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 192 + }, + { + "epoch": 0.15415335463258786, + "grad_norm": 0.18221652507781982, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 193 + }, + { + "epoch": 0.15495207667731628, + "grad_norm": 0.04752795770764351, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 194 + }, + { + "epoch": 0.15575079872204473, + "grad_norm": 0.14151020348072052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 195 + }, + { + "epoch": 0.15654952076677317, + "grad_norm": 0.27345412969589233, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 196 + }, + { + "epoch": 0.1573482428115016, + "grad_norm": 0.36259710788726807, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 197 + }, + { + "epoch": 0.15814696485623003, + "grad_norm": 0.30899694561958313, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 198 + }, + { + "epoch": 0.15894568690095848, + "grad_norm": 0.148394376039505, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 199 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 0.09150427579879761, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 200 + }, + { + "epoch": 0.16054313099041534, + "grad_norm": 0.2579229176044464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 201 + }, + { + "epoch": 0.16134185303514376, + "grad_norm": 0.35417553782463074, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 202 + }, + { + "epoch": 0.1621405750798722, + "grad_norm": 0.3410634994506836, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 203 + }, + { + "epoch": 0.16293929712460065, + "grad_norm": 0.20597697794437408, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 204 + }, + { + "epoch": 0.16373801916932906, + "grad_norm": 0.09722702950239182, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 205 + }, + { + "epoch": 0.1645367412140575, + "grad_norm": 0.29214075207710266, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 206 + }, + { + "epoch": 0.16533546325878595, + "grad_norm": 0.35695526003837585, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 207 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.23948919773101807, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 208 + }, + { + "epoch": 0.16693290734824281, + "grad_norm": 0.06467479467391968, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 209 + }, + { + "epoch": 0.16773162939297126, + "grad_norm": 0.2935601472854614, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 210 + }, + { + "epoch": 0.16853035143769968, + "grad_norm": 0.3354688882827759, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 211 + }, + { + "epoch": 0.16932907348242812, + "grad_norm": 0.206736221909523, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 212 + }, + { + "epoch": 0.17012779552715654, + "grad_norm": 0.04770192503929138, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 213 + }, + { + "epoch": 0.17092651757188498, + "grad_norm": 0.11713571101427078, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 214 + }, + { + "epoch": 0.17172523961661343, + "grad_norm": 0.1751943975687027, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 215 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 0.11709283292293549, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 216 + }, + { + "epoch": 0.1733226837060703, + "grad_norm": 0.08393140882253647, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 217 + }, + { + "epoch": 0.17412140575079874, + "grad_norm": 0.14036497473716736, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 218 + }, + { + "epoch": 0.17492012779552715, + "grad_norm": 0.19809649884700775, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 219 + }, + { + "epoch": 0.1757188498402556, + "grad_norm": 0.16380994021892548, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 220 + }, + { + "epoch": 0.17651757188498401, + "grad_norm": 0.03721015155315399, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 221 + }, + { + "epoch": 0.17731629392971246, + "grad_norm": 0.16769659519195557, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 222 + }, + { + "epoch": 0.1781150159744409, + "grad_norm": 0.2506882846355438, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 223 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 0.2812851667404175, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 224 + }, + { + "epoch": 0.17971246006389777, + "grad_norm": 0.2518095374107361, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 225 + }, + { + "epoch": 0.1805111821086262, + "grad_norm": 0.13027259707450867, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 226 + }, + { + "epoch": 0.18130990415335463, + "grad_norm": 0.051758985966444016, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 227 + }, + { + "epoch": 0.18210862619808307, + "grad_norm": 0.123250812292099, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 228 + }, + { + "epoch": 0.1829073482428115, + "grad_norm": 0.16475827991962433, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 229 + }, + { + "epoch": 0.18370607028753994, + "grad_norm": 0.15224772691726685, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 230 + }, + { + "epoch": 0.18450479233226838, + "grad_norm": 0.10693283379077911, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 231 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 0.059128716588020325, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 232 + }, + { + "epoch": 0.18610223642172524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 233 + }, + { + "epoch": 0.1869009584664537, + "grad_norm": 0.21447211503982544, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 234 + }, + { + "epoch": 0.1876996805111821, + "grad_norm": 0.214809849858284, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 235 + }, + { + "epoch": 0.18849840255591055, + "grad_norm": 0.16398873925209045, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 236 + }, + { + "epoch": 0.18929712460063897, + "grad_norm": 0.08273304253816605, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 237 + }, + { + "epoch": 0.1900958466453674, + "grad_norm": 0.08456159383058548, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 238 + }, + { + "epoch": 0.19089456869009586, + "grad_norm": 0.09653522819280624, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 239 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.13169406354427338, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 240 + }, + { + "epoch": 0.19249201277955272, + "grad_norm": 0.2328217476606369, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 241 + }, + { + "epoch": 0.19329073482428116, + "grad_norm": 0.2226463258266449, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 242 + }, + { + "epoch": 0.19408945686900958, + "grad_norm": 0.13330090045928955, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 243 + }, + { + "epoch": 0.19488817891373802, + "grad_norm": 0.15685412287712097, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 244 + }, + { + "epoch": 0.19568690095846644, + "grad_norm": 0.1528809666633606, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 245 + }, + { + "epoch": 0.1964856230031949, + "grad_norm": 0.2380320429801941, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 246 + }, + { + "epoch": 0.19728434504792333, + "grad_norm": 0.20447947084903717, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 247 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 0.162733793258667, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 248 + }, + { + "epoch": 0.1988817891373802, + "grad_norm": 0.10536827147006989, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 249 + }, + { + "epoch": 0.19968051118210864, + "grad_norm": 0.05464514344930649, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 250 + }, + { + "epoch": 0.20047923322683706, + "grad_norm": 0.052793700248003006, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 251 + }, + { + "epoch": 0.2012779552715655, + "grad_norm": 0.06936854124069214, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 252 + }, + { + "epoch": 0.20207667731629392, + "grad_norm": 0.17630355060100555, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 253 + }, + { + "epoch": 0.20287539936102236, + "grad_norm": 0.23443830013275146, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 254 + }, + { + "epoch": 0.2036741214057508, + "grad_norm": 0.21788854897022247, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 255 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.16827379167079926, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 256 + }, + { + "epoch": 0.20527156549520767, + "grad_norm": 0.08467451483011246, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 257 + }, + { + "epoch": 0.20607028753993611, + "grad_norm": 0.17747341096401215, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 258 + }, + { + "epoch": 0.20686900958466453, + "grad_norm": 0.20212751626968384, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 259 + }, + { + "epoch": 0.20766773162939298, + "grad_norm": 0.13319599628448486, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 260 + }, + { + "epoch": 0.2084664536741214, + "grad_norm": 0.13839752972126007, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 261 + }, + { + "epoch": 0.20926517571884984, + "grad_norm": 0.12351422011852264, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 262 + }, + { + "epoch": 0.21006389776357828, + "grad_norm": 0.1166408434510231, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 263 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 0.15500681102275848, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 264 + }, + { + "epoch": 0.21166134185303515, + "grad_norm": 0.045156076550483704, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 265 + }, + { + "epoch": 0.2124600638977636, + "grad_norm": 0.1413601189851761, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 266 + }, + { + "epoch": 0.213258785942492, + "grad_norm": 0.19309845566749573, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 267 + }, + { + "epoch": 0.21405750798722045, + "grad_norm": 0.22837650775909424, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 268 + }, + { + "epoch": 0.21485623003194887, + "grad_norm": 0.23372405767440796, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 269 + }, + { + "epoch": 0.21565495207667731, + "grad_norm": 0.2030618041753769, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 270 + }, + { + "epoch": 0.21645367412140576, + "grad_norm": 0.2092818021774292, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 271 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.18329963088035583, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 272 + }, + { + "epoch": 0.21805111821086262, + "grad_norm": 0.07353675365447998, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 273 + }, + { + "epoch": 0.21884984025559107, + "grad_norm": 0.08853492140769958, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 274 + }, + { + "epoch": 0.21964856230031948, + "grad_norm": 0.14666804671287537, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 275 + }, + { + "epoch": 0.22044728434504793, + "grad_norm": 0.12529602646827698, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 276 + }, + { + "epoch": 0.22124600638977635, + "grad_norm": 0.1571074277162552, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 277 + }, + { + "epoch": 0.2220447284345048, + "grad_norm": 0.09636949002742767, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 278 + }, + { + "epoch": 0.22284345047923323, + "grad_norm": 0.16803453862667084, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 279 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 0.258849561214447, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 280 + }, + { + "epoch": 0.2244408945686901, + "grad_norm": 0.29162102937698364, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 281 + }, + { + "epoch": 0.22523961661341854, + "grad_norm": 0.32085782289505005, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 282 + }, + { + "epoch": 0.22603833865814696, + "grad_norm": 0.24114084243774414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 283 + }, + { + "epoch": 0.2268370607028754, + "grad_norm": 0.11804991215467453, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 284 + }, + { + "epoch": 0.22763578274760382, + "grad_norm": 0.16640789806842804, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 285 + }, + { + "epoch": 0.22843450479233227, + "grad_norm": 0.33951282501220703, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 286 + }, + { + "epoch": 0.2292332268370607, + "grad_norm": 0.3939269483089447, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 287 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.2742229402065277, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 288 + }, + { + "epoch": 0.23083067092651757, + "grad_norm": 0.1000385507941246, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 289 + }, + { + "epoch": 0.23162939297124602, + "grad_norm": 0.15618765354156494, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 290 + }, + { + "epoch": 0.23242811501597443, + "grad_norm": 0.3464474081993103, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 291 + }, + { + "epoch": 0.23322683706070288, + "grad_norm": 0.4524421989917755, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 292 + }, + { + "epoch": 0.2340255591054313, + "grad_norm": 0.38890203833580017, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 293 + }, + { + "epoch": 0.23482428115015974, + "grad_norm": 0.15225796401500702, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 294 + }, + { + "epoch": 0.2356230031948882, + "grad_norm": 0.18742015957832336, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 295 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 0.454607754945755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 296 + }, + { + "epoch": 0.23722044728434505, + "grad_norm": 0.4426102638244629, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 297 + }, + { + "epoch": 0.2380191693290735, + "grad_norm": 0.1442587673664093, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 298 + }, + { + "epoch": 0.2388178913738019, + "grad_norm": 0.2338172197341919, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 299 + }, + { + "epoch": 0.23961661341853036, + "grad_norm": 0.4115936756134033, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 300 + }, + { + "epoch": 0.24041533546325877, + "grad_norm": 0.38746342062950134, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 301 + }, + { + "epoch": 0.24121405750798722, + "grad_norm": 0.11506912112236023, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 302 + }, + { + "epoch": 0.24201277955271566, + "grad_norm": 0.20454810559749603, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 303 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.34620603919029236, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 304 + }, + { + "epoch": 0.24361022364217252, + "grad_norm": 0.27727624773979187, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 305 + }, + { + "epoch": 0.24440894568690097, + "grad_norm": 0.062395140528678894, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 306 + }, + { + "epoch": 0.2452076677316294, + "grad_norm": 0.25391891598701477, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 307 + }, + { + "epoch": 0.24600638977635783, + "grad_norm": 0.3807840049266815, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 308 + }, + { + "epoch": 0.24680511182108625, + "grad_norm": 0.31564414501190186, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 309 + }, + { + "epoch": 0.2476038338658147, + "grad_norm": 0.044667672365903854, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 310 + }, + { + "epoch": 0.24840255591054314, + "grad_norm": 0.2656041979789734, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 311 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 0.2954655587673187, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 312 + }, + { + "epoch": 0.25, + "grad_norm": 0.14636820554733276, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 313 + }, + { + "epoch": 0.2507987220447284, + "grad_norm": 0.16759099066257477, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 314 + }, + { + "epoch": 0.2515974440894569, + "grad_norm": 0.28777605295181274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 315 + }, + { + "epoch": 0.2523961661341853, + "grad_norm": 0.2817089855670929, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 316 + }, + { + "epoch": 0.2531948881789137, + "grad_norm": 0.09457004815340042, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 317 + }, + { + "epoch": 0.2539936102236422, + "grad_norm": 0.15224558115005493, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 318 + }, + { + "epoch": 0.2547923322683706, + "grad_norm": 0.17883236706256866, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 319 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.08269336074590683, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 320 + }, + { + "epoch": 0.2563897763578275, + "grad_norm": 0.10430650413036346, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 321 + }, + { + "epoch": 0.2571884984025559, + "grad_norm": 0.06464210897684097, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 322 + }, + { + "epoch": 0.25798722044728434, + "grad_norm": 0.08100844919681549, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 323 + }, + { + "epoch": 0.25878594249201275, + "grad_norm": 0.10375291109085083, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 324 + }, + { + "epoch": 0.2595846645367412, + "grad_norm": 0.14621509611606598, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 325 + }, + { + "epoch": 0.26038338658146964, + "grad_norm": 0.12707975506782532, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 326 + }, + { + "epoch": 0.26118210862619806, + "grad_norm": 0.04542430862784386, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 327 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 0.13504259288311005, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 328 + }, + { + "epoch": 0.26277955271565495, + "grad_norm": 0.20337320864200592, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 329 + }, + { + "epoch": 0.26357827476038337, + "grad_norm": 0.23682020604610443, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 330 + }, + { + "epoch": 0.26437699680511184, + "grad_norm": 0.15198387205600739, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 331 + }, + { + "epoch": 0.26517571884984026, + "grad_norm": 0.04014969989657402, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 332 + }, + { + "epoch": 0.2659744408945687, + "grad_norm": 0.10505357384681702, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 333 + }, + { + "epoch": 0.26677316293929715, + "grad_norm": 0.08121145516633987, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 334 + }, + { + "epoch": 0.26757188498402557, + "grad_norm": 0.062118109315633774, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 335 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.13389311730861664, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 336 + }, + { + "epoch": 0.26916932907348246, + "grad_norm": 0.24840199947357178, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 337 + }, + { + "epoch": 0.26996805111821087, + "grad_norm": 0.33511659502983093, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 338 + }, + { + "epoch": 0.2707667731629393, + "grad_norm": 0.2905866801738739, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 339 + }, + { + "epoch": 0.2715654952076677, + "grad_norm": 0.15471668541431427, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 340 + }, + { + "epoch": 0.2723642172523962, + "grad_norm": 0.09973842650651932, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 341 + }, + { + "epoch": 0.2731629392971246, + "grad_norm": 0.19315758347511292, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 342 + }, + { + "epoch": 0.273961661341853, + "grad_norm": 0.2122231423854828, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 343 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 0.11207931488752365, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 344 + }, + { + "epoch": 0.2755591054313099, + "grad_norm": 0.11863203346729279, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 345 + }, + { + "epoch": 0.2763578274760383, + "grad_norm": 0.22022183239459991, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 346 + }, + { + "epoch": 0.2771565495207668, + "grad_norm": 0.225724458694458, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 347 + }, + { + "epoch": 0.2779552715654952, + "grad_norm": 0.1622191071510315, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 348 + }, + { + "epoch": 0.2787539936102236, + "grad_norm": 0.05987359210848808, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 349 + }, + { + "epoch": 0.2795527156549521, + "grad_norm": 0.08514829725027084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 350 + }, + { + "epoch": 0.2803514376996805, + "grad_norm": 0.10734611004590988, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 351 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.12458663433790207, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 352 + }, + { + "epoch": 0.2819488817891374, + "grad_norm": 0.12223048508167267, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 353 + }, + { + "epoch": 0.2827476038338658, + "grad_norm": 0.0663333311676979, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 354 + }, + { + "epoch": 0.28354632587859424, + "grad_norm": 0.0628359317779541, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 355 + }, + { + "epoch": 0.28434504792332266, + "grad_norm": 0.1566074788570404, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 356 + }, + { + "epoch": 0.28514376996805113, + "grad_norm": 0.23291122913360596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 357 + }, + { + "epoch": 0.28594249201277955, + "grad_norm": 0.21403467655181885, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 358 + }, + { + "epoch": 0.28674121405750796, + "grad_norm": 0.08412498980760574, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 359 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 0.1415901631116867, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 360 + }, + { + "epoch": 0.28833865814696485, + "grad_norm": 0.29960349202156067, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 361 + }, + { + "epoch": 0.28913738019169327, + "grad_norm": 0.33849450945854187, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 362 + }, + { + "epoch": 0.28993610223642174, + "grad_norm": 0.24428068101406097, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 363 + }, + { + "epoch": 0.29073482428115016, + "grad_norm": 0.07897785305976868, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 364 + }, + { + "epoch": 0.2915335463258786, + "grad_norm": 0.1347426027059555, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 365 + }, + { + "epoch": 0.29233226837060705, + "grad_norm": 0.21387724578380585, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 366 + }, + { + "epoch": 0.29313099041533547, + "grad_norm": 0.13869348168373108, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 367 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.062060993164777756, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 368 + }, + { + "epoch": 0.29472843450479236, + "grad_norm": 0.13848915696144104, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 369 + }, + { + "epoch": 0.2955271565495208, + "grad_norm": 0.12179117649793625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 370 + }, + { + "epoch": 0.2963258785942492, + "grad_norm": 0.13039280474185944, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 371 + }, + { + "epoch": 0.2971246006389776, + "grad_norm": 0.09119348227977753, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 372 + }, + { + "epoch": 0.2979233226837061, + "grad_norm": 0.06374438107013702, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 373 + }, + { + "epoch": 0.2987220447284345, + "grad_norm": 0.1524113267660141, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 374 + }, + { + "epoch": 0.2995207667731629, + "grad_norm": 0.18103912472724915, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 375 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 0.1439986377954483, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 376 + }, + { + "epoch": 0.3011182108626198, + "grad_norm": 0.1268371045589447, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 377 + }, + { + "epoch": 0.3019169329073482, + "grad_norm": 0.07370569556951523, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 378 + }, + { + "epoch": 0.3027156549520767, + "grad_norm": 0.0718536451458931, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 379 + }, + { + "epoch": 0.3035143769968051, + "grad_norm": 0.10444384068250656, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 380 + }, + { + "epoch": 0.30431309904153353, + "grad_norm": 0.10085552930831909, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 381 + }, + { + "epoch": 0.305111821086262, + "grad_norm": 0.08599484711885452, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 382 + }, + { + "epoch": 0.3059105431309904, + "grad_norm": 0.08912923187017441, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 383 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.17919759452342987, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 384 + }, + { + "epoch": 0.3075079872204473, + "grad_norm": 0.23954501748085022, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 385 + }, + { + "epoch": 0.3083067092651757, + "grad_norm": 0.2940942645072937, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 386 + }, + { + "epoch": 0.30910543130990414, + "grad_norm": 0.2905970513820648, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 387 + }, + { + "epoch": 0.30990415335463256, + "grad_norm": 0.2555491626262665, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 388 + }, + { + "epoch": 0.31070287539936103, + "grad_norm": 0.15303272008895874, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 389 + }, + { + "epoch": 0.31150159744408945, + "grad_norm": 0.10148895531892776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 390 + }, + { + "epoch": 0.31230031948881787, + "grad_norm": 0.21828792989253998, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 391 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 0.27219685912132263, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 392 + }, + { + "epoch": 0.31389776357827476, + "grad_norm": 0.3431699872016907, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 393 + }, + { + "epoch": 0.3146964856230032, + "grad_norm": 0.32346805930137634, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 394 + }, + { + "epoch": 0.31549520766773165, + "grad_norm": 0.17791730165481567, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 395 + }, + { + "epoch": 0.31629392971246006, + "grad_norm": 0.09576063603162766, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 396 + }, + { + "epoch": 0.3170926517571885, + "grad_norm": 0.050598498433828354, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 397 + }, + { + "epoch": 0.31789137380191695, + "grad_norm": 0.07385009527206421, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 398 + }, + { + "epoch": 0.31869009584664537, + "grad_norm": 0.08680527657270432, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 399 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.06436332315206528, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 400 + }, + { + "epoch": 0.32028753993610226, + "grad_norm": 0.05943639203906059, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 401 + }, + { + "epoch": 0.3210862619808307, + "grad_norm": 0.10015929490327835, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 402 + }, + { + "epoch": 0.3218849840255591, + "grad_norm": 0.07852698862552643, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 403 + }, + { + "epoch": 0.3226837060702875, + "grad_norm": 0.06103534996509552, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 404 + }, + { + "epoch": 0.323482428115016, + "grad_norm": 0.04573113098740578, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 405 + }, + { + "epoch": 0.3242811501597444, + "grad_norm": 0.06108849495649338, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 406 + }, + { + "epoch": 0.3250798722044728, + "grad_norm": 0.10209841281175613, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 407 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 0.0956021398305893, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 408 + }, + { + "epoch": 0.3266773162939297, + "grad_norm": 0.12572422623634338, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 409 + }, + { + "epoch": 0.3274760383386581, + "grad_norm": 0.1532585173845291, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 410 + }, + { + "epoch": 0.3282747603833866, + "grad_norm": 0.10664337128400803, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 411 + }, + { + "epoch": 0.329073482428115, + "grad_norm": 0.07705336064100266, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 412 + }, + { + "epoch": 0.32987220447284343, + "grad_norm": 0.08611477166414261, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 413 + }, + { + "epoch": 0.3306709265175719, + "grad_norm": 0.11460789293050766, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 414 + }, + { + "epoch": 0.3314696485623003, + "grad_norm": 0.1214505136013031, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 415 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.07482243329286575, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 416 + }, + { + "epoch": 0.3330670926517572, + "grad_norm": 0.05022026225924492, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 417 + }, + { + "epoch": 0.33386581469648563, + "grad_norm": 0.086161769926548, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 418 + }, + { + "epoch": 0.33466453674121405, + "grad_norm": 0.05073339864611626, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 419 + }, + { + "epoch": 0.3354632587859425, + "grad_norm": 0.0925290584564209, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 420 + }, + { + "epoch": 0.33626198083067094, + "grad_norm": 0.08073565363883972, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 421 + }, + { + "epoch": 0.33706070287539935, + "grad_norm": 0.06067343428730965, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 422 + }, + { + "epoch": 0.33785942492012777, + "grad_norm": 0.16081079840660095, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 423 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 0.3043743371963501, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 424 + }, + { + "epoch": 0.33945686900958466, + "grad_norm": 0.32498979568481445, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 425 + }, + { + "epoch": 0.3402555910543131, + "grad_norm": 0.206096351146698, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 426 + }, + { + "epoch": 0.34105431309904155, + "grad_norm": 0.11892937123775482, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 427 + }, + { + "epoch": 0.34185303514376997, + "grad_norm": 0.19896888732910156, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 428 + }, + { + "epoch": 0.3426517571884984, + "grad_norm": 0.3295411169528961, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 429 + }, + { + "epoch": 0.34345047923322686, + "grad_norm": 0.3841599225997925, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 430 + }, + { + "epoch": 0.3442492012779553, + "grad_norm": 0.36113840341567993, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 431 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.25694623589515686, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 432 + }, + { + "epoch": 0.34584664536741216, + "grad_norm": 0.07741750776767731, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 433 + }, + { + "epoch": 0.3466453674121406, + "grad_norm": 0.1385476440191269, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 434 + }, + { + "epoch": 0.347444089456869, + "grad_norm": 0.22972947359085083, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 435 + }, + { + "epoch": 0.34824281150159747, + "grad_norm": 0.15720337629318237, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 436 + }, + { + "epoch": 0.3490415335463259, + "grad_norm": 0.04451138526201248, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 437 + }, + { + "epoch": 0.3498402555910543, + "grad_norm": 0.15054486691951752, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 438 + }, + { + "epoch": 0.3506389776357827, + "grad_norm": 0.16740895807743073, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 439 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 0.1388419270515442, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 440 + }, + { + "epoch": 0.3522364217252396, + "grad_norm": 0.06480700522661209, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 441 + }, + { + "epoch": 0.35303514376996803, + "grad_norm": 0.09604794532060623, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 442 + }, + { + "epoch": 0.3538338658146965, + "grad_norm": 0.174916610121727, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.3546325878594249, + "grad_norm": 0.2228047251701355, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 444 + }, + { + "epoch": 0.35543130990415334, + "grad_norm": 0.24461773037910461, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 445 + }, + { + "epoch": 0.3562300319488818, + "grad_norm": 0.2201017141342163, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 446 + }, + { + "epoch": 0.3570287539936102, + "grad_norm": 0.11596337705850601, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 447 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.1682164967060089, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 448 + }, + { + "epoch": 0.3586261980830671, + "grad_norm": 0.4297041594982147, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 449 + }, + { + "epoch": 0.35942492012779553, + "grad_norm": 0.5659548044204712, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 450 + }, + { + "epoch": 0.36022364217252395, + "grad_norm": 0.5303114652633667, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 451 + }, + { + "epoch": 0.3610223642172524, + "grad_norm": 0.23788955807685852, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 452 + }, + { + "epoch": 0.36182108626198084, + "grad_norm": 0.15622566640377045, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 453 + }, + { + "epoch": 0.36261980830670926, + "grad_norm": 0.327275812625885, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 454 + }, + { + "epoch": 0.3634185303514377, + "grad_norm": 0.23511037230491638, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 455 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 0.11690831184387207, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 456 + }, + { + "epoch": 0.36501597444089456, + "grad_norm": 0.17950886487960815, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 457 + }, + { + "epoch": 0.365814696485623, + "grad_norm": 0.13816051185131073, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 458 + }, + { + "epoch": 0.36661341853035145, + "grad_norm": 0.09056458622217178, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 459 + }, + { + "epoch": 0.36741214057507987, + "grad_norm": 0.1648412048816681, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 460 + }, + { + "epoch": 0.3682108626198083, + "grad_norm": 0.24407249689102173, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 461 + }, + { + "epoch": 0.36900958466453676, + "grad_norm": 0.1896992176771164, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 462 + }, + { + "epoch": 0.3698083067092652, + "grad_norm": 0.07938385009765625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 463 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.10241381078958511, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 464 + }, + { + "epoch": 0.37140575079872207, + "grad_norm": 0.14765797555446625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 465 + }, + { + "epoch": 0.3722044728434505, + "grad_norm": 0.11189796775579453, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 466 + }, + { + "epoch": 0.3730031948881789, + "grad_norm": 0.05604114383459091, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 467 + }, + { + "epoch": 0.3738019169329074, + "grad_norm": 0.18633529543876648, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 468 + }, + { + "epoch": 0.3746006389776358, + "grad_norm": 0.2587120234966278, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 469 + }, + { + "epoch": 0.3753993610223642, + "grad_norm": 0.21629218757152557, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 470 + }, + { + "epoch": 0.3761980830670926, + "grad_norm": 0.11872006952762604, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 471 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 0.07732011377811432, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 472 + }, + { + "epoch": 0.3777955271565495, + "grad_norm": 0.20141537487506866, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 473 + }, + { + "epoch": 0.37859424920127793, + "grad_norm": 0.26726409792900085, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 474 + }, + { + "epoch": 0.3793929712460064, + "grad_norm": 0.2373354583978653, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 475 + }, + { + "epoch": 0.3801916932907348, + "grad_norm": 0.15030571818351746, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 476 + }, + { + "epoch": 0.38099041533546324, + "grad_norm": 0.05345006287097931, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 477 + }, + { + "epoch": 0.3817891373801917, + "grad_norm": 0.12551648914813995, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 478 + }, + { + "epoch": 0.38258785942492013, + "grad_norm": 0.14036186039447784, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 479 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.09807970374822617, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 480 + }, + { + "epoch": 0.384185303514377, + "grad_norm": 0.05071088671684265, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 481 + }, + { + "epoch": 0.38498402555910544, + "grad_norm": 0.07541649043560028, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 482 + }, + { + "epoch": 0.38578274760383385, + "grad_norm": 0.059762127697467804, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 483 + }, + { + "epoch": 0.3865814696485623, + "grad_norm": 0.05540496110916138, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 484 + }, + { + "epoch": 0.38738019169329074, + "grad_norm": 0.09137953072786331, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 485 + }, + { + "epoch": 0.38817891373801916, + "grad_norm": 0.1349237710237503, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 486 + }, + { + "epoch": 0.3889776357827476, + "grad_norm": 0.13889296352863312, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 487 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 0.16406965255737305, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 488 + }, + { + "epoch": 0.39057507987220447, + "grad_norm": 0.1748959869146347, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 489 + }, + { + "epoch": 0.3913738019169329, + "grad_norm": 0.1518068015575409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 490 + }, + { + "epoch": 0.39217252396166136, + "grad_norm": 0.06694433838129044, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 491 + }, + { + "epoch": 0.3929712460063898, + "grad_norm": 0.11556574702262878, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 492 + }, + { + "epoch": 0.3937699680511182, + "grad_norm": 0.2562897801399231, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 493 + }, + { + "epoch": 0.39456869009584666, + "grad_norm": 0.30842337012290955, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 494 + }, + { + "epoch": 0.3953674121405751, + "grad_norm": 0.30477815866470337, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 495 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.2602941691875458, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 496 + }, + { + "epoch": 0.39696485623003197, + "grad_norm": 0.1692838817834854, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 497 + }, + { + "epoch": 0.3977635782747604, + "grad_norm": 0.07468903064727783, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 498 + }, + { + "epoch": 0.3985623003194888, + "grad_norm": 0.05872616916894913, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 499 + }, + { + "epoch": 0.3993610223642173, + "grad_norm": 0.09878433495759964, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 500 + }, + { + "epoch": 0.4001597444089457, + "grad_norm": 0.13779069483280182, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 501 + }, + { + "epoch": 0.4009584664536741, + "grad_norm": 0.17778213322162628, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 502 + }, + { + "epoch": 0.40175718849840253, + "grad_norm": 0.15572750568389893, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 503 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 0.1154002770781517, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 504 + }, + { + "epoch": 0.4033546325878594, + "grad_norm": 0.04485362395644188, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 505 + }, + { + "epoch": 0.40415335463258784, + "grad_norm": 0.07514321058988571, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 506 + }, + { + "epoch": 0.4049520766773163, + "grad_norm": 0.13954220712184906, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 507 + }, + { + "epoch": 0.4057507987220447, + "grad_norm": 0.20726922154426575, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 508 + }, + { + "epoch": 0.40654952076677314, + "grad_norm": 0.28239160776138306, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 509 + }, + { + "epoch": 0.4073482428115016, + "grad_norm": 0.28484129905700684, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 510 + }, + { + "epoch": 0.40814696485623003, + "grad_norm": 0.28111377358436584, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 511 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.25087496638298035, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 512 + }, + { + "epoch": 0.4097444089456869, + "grad_norm": 0.1652008444070816, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 513 + }, + { + "epoch": 0.41054313099041534, + "grad_norm": 0.11345700174570084, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 514 + }, + { + "epoch": 0.41134185303514376, + "grad_norm": 0.1191159337759018, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 515 + }, + { + "epoch": 0.41214057507987223, + "grad_norm": 0.26302817463874817, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 516 + }, + { + "epoch": 0.41293929712460065, + "grad_norm": 0.3303217589855194, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 517 + }, + { + "epoch": 0.41373801916932906, + "grad_norm": 0.2874647378921509, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 518 + }, + { + "epoch": 0.4145367412140575, + "grad_norm": 0.23112182319164276, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 519 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 0.16285021603107452, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 520 + }, + { + "epoch": 0.41613418530351437, + "grad_norm": 0.08440099656581879, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 521 + }, + { + "epoch": 0.4169329073482428, + "grad_norm": 0.03578028455376625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 522 + }, + { + "epoch": 0.41773162939297126, + "grad_norm": 0.0995275005698204, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 523 + }, + { + "epoch": 0.4185303514376997, + "grad_norm": 0.17713160812854767, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 524 + }, + { + "epoch": 0.4193290734824281, + "grad_norm": 0.1685509830713272, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 525 + }, + { + "epoch": 0.42012779552715657, + "grad_norm": 0.11357919126749039, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 526 + }, + { + "epoch": 0.420926517571885, + "grad_norm": 0.059025365859270096, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 527 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.05128806456923485, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 528 + }, + { + "epoch": 0.4225239616613419, + "grad_norm": 0.05291247367858887, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 529 + }, + { + "epoch": 0.4233226837060703, + "grad_norm": 0.10755500197410583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 530 + }, + { + "epoch": 0.4241214057507987, + "grad_norm": 0.15659615397453308, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 531 + }, + { + "epoch": 0.4249201277955272, + "grad_norm": 0.19369953870773315, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 532 + }, + { + "epoch": 0.4257188498402556, + "grad_norm": 0.16491396725177765, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 533 + }, + { + "epoch": 0.426517571884984, + "grad_norm": 0.10276799649000168, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 534 + }, + { + "epoch": 0.4273162939297125, + "grad_norm": 0.06273368000984192, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 535 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 0.03896406292915344, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 536 + }, + { + "epoch": 0.4289137380191693, + "grad_norm": 0.08083273470401764, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 537 + }, + { + "epoch": 0.42971246006389774, + "grad_norm": 0.05107828602194786, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 538 + }, + { + "epoch": 0.4305111821086262, + "grad_norm": 0.04359392821788788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 539 + }, + { + "epoch": 0.43130990415335463, + "grad_norm": 0.04225402697920799, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 540 + }, + { + "epoch": 0.43210862619808305, + "grad_norm": 0.07523404061794281, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 541 + }, + { + "epoch": 0.4329073482428115, + "grad_norm": 0.07966417819261551, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 542 + }, + { + "epoch": 0.43370607028753994, + "grad_norm": 0.04529299959540367, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 543 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.0793156549334526, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 544 + }, + { + "epoch": 0.4353035143769968, + "grad_norm": 0.1533992737531662, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 545 + }, + { + "epoch": 0.43610223642172524, + "grad_norm": 0.2893797755241394, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 546 + }, + { + "epoch": 0.43690095846645366, + "grad_norm": 0.4145842492580414, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 547 + }, + { + "epoch": 0.43769968051118213, + "grad_norm": 0.4550987482070923, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 548 + }, + { + "epoch": 0.43849840255591055, + "grad_norm": 0.4318651556968689, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 549 + }, + { + "epoch": 0.43929712460063897, + "grad_norm": 0.35961681604385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 550 + }, + { + "epoch": 0.44009584664536744, + "grad_norm": 0.18606753647327423, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 551 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 0.12992478907108307, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 552 + }, + { + "epoch": 0.4416932907348243, + "grad_norm": 0.32936930656433105, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 553 + }, + { + "epoch": 0.4424920127795527, + "grad_norm": 0.3547491133213043, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 554 + }, + { + "epoch": 0.44329073482428116, + "grad_norm": 0.2144627720117569, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 555 + }, + { + "epoch": 0.4440894568690096, + "grad_norm": 0.07260395586490631, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 556 + }, + { + "epoch": 0.444888178913738, + "grad_norm": 0.19895662367343903, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 557 + }, + { + "epoch": 0.44568690095846647, + "grad_norm": 0.18664990365505219, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 558 + }, + { + "epoch": 0.4464856230031949, + "grad_norm": 0.11666610836982727, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 559 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.11163592338562012, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 560 + }, + { + "epoch": 0.4480830670926518, + "grad_norm": 0.1815878301858902, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 561 + }, + { + "epoch": 0.4488817891373802, + "grad_norm": 0.2593924105167389, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 562 + }, + { + "epoch": 0.4496805111821086, + "grad_norm": 0.20761220157146454, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 563 + }, + { + "epoch": 0.4504792332268371, + "grad_norm": 0.06589766591787338, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 564 + }, + { + "epoch": 0.4512779552715655, + "grad_norm": 0.21619920432567596, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 565 + }, + { + "epoch": 0.4520766773162939, + "grad_norm": 0.2392708659172058, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 566 + }, + { + "epoch": 0.4528753993610224, + "grad_norm": 0.23214633762836456, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 567 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 0.263883501291275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 568 + }, + { + "epoch": 0.4544728434504792, + "grad_norm": 0.19914190471172333, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 569 + }, + { + "epoch": 0.45527156549520764, + "grad_norm": 0.11453433334827423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 570 + }, + { + "epoch": 0.4560702875399361, + "grad_norm": 0.15091221034526825, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 571 + }, + { + "epoch": 0.45686900958466453, + "grad_norm": 0.043582383543252945, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 572 + }, + { + "epoch": 0.45766773162939295, + "grad_norm": 0.14068740606307983, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 573 + }, + { + "epoch": 0.4584664536741214, + "grad_norm": 0.1274290233850479, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 574 + }, + { + "epoch": 0.45926517571884984, + "grad_norm": 0.13504599034786224, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 575 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.1267779916524887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 576 + }, + { + "epoch": 0.46086261980830673, + "grad_norm": 0.08138085901737213, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 577 + }, + { + "epoch": 0.46166134185303515, + "grad_norm": 0.07772356271743774, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 578 + }, + { + "epoch": 0.46246006389776356, + "grad_norm": 0.06863631308078766, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 579 + }, + { + "epoch": 0.46325878594249204, + "grad_norm": 0.1232575923204422, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 580 + }, + { + "epoch": 0.46405750798722045, + "grad_norm": 0.179134801030159, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 581 + }, + { + "epoch": 0.46485623003194887, + "grad_norm": 0.20545582473278046, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 582 + }, + { + "epoch": 0.46565495207667734, + "grad_norm": 0.14182575047016144, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 583 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 0.05813328176736832, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 584 + }, + { + "epoch": 0.4672523961661342, + "grad_norm": 0.1530984789133072, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 585 + }, + { + "epoch": 0.4680511182108626, + "grad_norm": 0.2820036709308624, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 586 + }, + { + "epoch": 0.46884984025559107, + "grad_norm": 0.39252954721450806, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 587 + }, + { + "epoch": 0.4696485623003195, + "grad_norm": 0.40830549597740173, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 588 + }, + { + "epoch": 0.4704472843450479, + "grad_norm": 0.2846182882785797, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 589 + }, + { + "epoch": 0.4712460063897764, + "grad_norm": 0.06798163801431656, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 590 + }, + { + "epoch": 0.4720447284345048, + "grad_norm": 0.18650950491428375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 591 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.2965260446071625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 592 + }, + { + "epoch": 0.4736421725239617, + "grad_norm": 0.24504852294921875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 593 + }, + { + "epoch": 0.4744408945686901, + "grad_norm": 0.11336984485387802, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 594 + }, + { + "epoch": 0.4752396166134185, + "grad_norm": 0.09007567912340164, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 595 + }, + { + "epoch": 0.476038338658147, + "grad_norm": 0.225834459066391, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 596 + }, + { + "epoch": 0.4768370607028754, + "grad_norm": 0.2679842710494995, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 597 + }, + { + "epoch": 0.4776357827476038, + "grad_norm": 0.1801901012659073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 598 + }, + { + "epoch": 0.4784345047923323, + "grad_norm": 0.09554167836904526, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 599 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 0.046632468700408936, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 600 + }, + { + "epoch": 0.48003194888178913, + "grad_norm": 0.12078758329153061, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 601 + }, + { + "epoch": 0.48083067092651754, + "grad_norm": 0.12126865237951279, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 602 + }, + { + "epoch": 0.481629392971246, + "grad_norm": 0.14078640937805176, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 603 + }, + { + "epoch": 0.48242811501597443, + "grad_norm": 0.18556037545204163, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 604 + }, + { + "epoch": 0.48322683706070285, + "grad_norm": 0.178151473402977, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 605 + }, + { + "epoch": 0.4840255591054313, + "grad_norm": 0.1672516018152237, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 606 + }, + { + "epoch": 0.48482428115015974, + "grad_norm": 0.11648737639188766, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 607 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.11820051819086075, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 608 + }, + { + "epoch": 0.48642172523961663, + "grad_norm": 0.21110932528972626, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 609 + }, + { + "epoch": 0.48722044728434505, + "grad_norm": 0.24852754175662994, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 610 + }, + { + "epoch": 0.48801916932907347, + "grad_norm": 0.2633175551891327, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 611 + }, + { + "epoch": 0.48881789137380194, + "grad_norm": 0.21904303133487701, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 612 + }, + { + "epoch": 0.48961661341853036, + "grad_norm": 0.07822466641664505, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 613 + }, + { + "epoch": 0.4904153354632588, + "grad_norm": 0.0767827108502388, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 614 + }, + { + "epoch": 0.49121405750798725, + "grad_norm": 0.07943699508905411, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 615 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 0.055741772055625916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 616 + }, + { + "epoch": 0.4928115015974441, + "grad_norm": 0.10400068014860153, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 617 + }, + { + "epoch": 0.4936102236421725, + "grad_norm": 0.05080602690577507, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 618 + }, + { + "epoch": 0.49440894568690097, + "grad_norm": 0.07927533984184265, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 619 + }, + { + "epoch": 0.4952076677316294, + "grad_norm": 0.07919944822788239, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 620 + }, + { + "epoch": 0.4960063897763578, + "grad_norm": 0.11013699322938919, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 621 + }, + { + "epoch": 0.4968051118210863, + "grad_norm": 0.16232389211654663, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 622 + }, + { + "epoch": 0.4976038338658147, + "grad_norm": 0.17625346779823303, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 623 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.1681327521800995, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 624 + }, + { + "epoch": 0.4992012779552716, + "grad_norm": 0.1882159262895584, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 625 + }, + { + "epoch": 0.5, + "grad_norm": 0.21075129508972168, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 626 + }, + { + "epoch": 0.5007987220447284, + "grad_norm": 0.1464296281337738, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 627 + }, + { + "epoch": 0.5015974440894568, + "grad_norm": 0.11155212670564651, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 628 + }, + { + "epoch": 0.5023961661341853, + "grad_norm": 0.09794416278600693, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 629 + }, + { + "epoch": 0.5031948881789138, + "grad_norm": 0.12095183879137039, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 630 + }, + { + "epoch": 0.5039936102236422, + "grad_norm": 0.1933794617652893, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 631 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 0.32272887229919434, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 632 + }, + { + "epoch": 0.505591054313099, + "grad_norm": 0.2507671117782593, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 633 + }, + { + "epoch": 0.5063897763578274, + "grad_norm": 0.09540661424398422, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 634 + }, + { + "epoch": 0.5071884984025559, + "grad_norm": 0.07341819256544113, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 635 + }, + { + "epoch": 0.5079872204472844, + "grad_norm": 0.11610874533653259, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 636 + }, + { + "epoch": 0.5087859424920128, + "grad_norm": 0.1338607519865036, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 637 + }, + { + "epoch": 0.5095846645367412, + "grad_norm": 0.07892445474863052, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 638 + }, + { + "epoch": 0.5103833865814696, + "grad_norm": 0.053661834448575974, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 639 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.06852453202009201, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 640 + }, + { + "epoch": 0.5119808306709265, + "grad_norm": 0.045109208673238754, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 641 + }, + { + "epoch": 0.512779552715655, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 642 + }, + { + "epoch": 0.5135782747603834, + "grad_norm": 0.05903350189328194, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 643 + }, + { + "epoch": 0.5143769968051118, + "grad_norm": 0.07314767688512802, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 644 + }, + { + "epoch": 0.5151757188498403, + "grad_norm": 0.12484236806631088, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 645 + }, + { + "epoch": 0.5159744408945687, + "grad_norm": 0.15683352947235107, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 646 + }, + { + "epoch": 0.5167731629392971, + "grad_norm": 0.13519413769245148, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 647 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 0.10333485156297684, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 648 + }, + { + "epoch": 0.518370607028754, + "grad_norm": 0.09626923501491547, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 649 + }, + { + "epoch": 0.5191693290734825, + "grad_norm": 0.08177447319030762, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 650 + }, + { + "epoch": 0.5199680511182109, + "grad_norm": 0.04186684265732765, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 651 + }, + { + "epoch": 0.5207667731629393, + "grad_norm": 0.07705547660589218, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 652 + }, + { + "epoch": 0.5215654952076677, + "grad_norm": 0.05885700136423111, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 653 + }, + { + "epoch": 0.5223642172523961, + "grad_norm": 0.14140211045742035, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 654 + }, + { + "epoch": 0.5231629392971247, + "grad_norm": 0.18797138333320618, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 655 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.2301982045173645, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 656 + }, + { + "epoch": 0.5247603833865815, + "grad_norm": 0.2813114523887634, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 657 + }, + { + "epoch": 0.5255591054313099, + "grad_norm": 0.3205592930316925, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 658 + }, + { + "epoch": 0.5263578274760383, + "grad_norm": 0.3426150381565094, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 659 + }, + { + "epoch": 0.5271565495207667, + "grad_norm": 0.2636663615703583, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 660 + }, + { + "epoch": 0.5279552715654952, + "grad_norm": 0.14799079298973083, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 661 + }, + { + "epoch": 0.5287539936102237, + "grad_norm": 0.06354992836713791, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 662 + }, + { + "epoch": 0.5295527156549521, + "grad_norm": 0.239300936460495, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 663 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 0.33535388112068176, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 664 + }, + { + "epoch": 0.5311501597444089, + "grad_norm": 0.32471078634262085, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 665 + }, + { + "epoch": 0.5319488817891374, + "grad_norm": 0.2491266429424286, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 666 + }, + { + "epoch": 0.5327476038338658, + "grad_norm": 0.09841614216566086, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 667 + }, + { + "epoch": 0.5335463258785943, + "grad_norm": 0.1310579627752304, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 668 + }, + { + "epoch": 0.5343450479233227, + "grad_norm": 0.28287971019744873, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 669 + }, + { + "epoch": 0.5351437699680511, + "grad_norm": 0.3457719385623932, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 670 + }, + { + "epoch": 0.5359424920127795, + "grad_norm": 0.31690946221351624, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 671 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.19356760382652283, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 672 + }, + { + "epoch": 0.5375399361022364, + "grad_norm": 0.05940595269203186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 673 + }, + { + "epoch": 0.5383386581469649, + "grad_norm": 0.20772181451320648, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 674 + }, + { + "epoch": 0.5391373801916933, + "grad_norm": 0.3093980848789215, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 675 + }, + { + "epoch": 0.5399361022364217, + "grad_norm": 0.2632107734680176, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 676 + }, + { + "epoch": 0.5407348242811502, + "grad_norm": 0.12365782260894775, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 677 + }, + { + "epoch": 0.5415335463258786, + "grad_norm": 0.07215466350317001, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 678 + }, + { + "epoch": 0.542332268370607, + "grad_norm": 0.16745947301387787, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 679 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 0.14418186247348785, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 680 + }, + { + "epoch": 0.5439297124600639, + "grad_norm": 0.048094023019075394, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 681 + }, + { + "epoch": 0.5447284345047924, + "grad_norm": 0.10100048035383224, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 682 + }, + { + "epoch": 0.5455271565495208, + "grad_norm": 0.13719545304775238, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 683 + }, + { + "epoch": 0.5463258785942492, + "grad_norm": 0.16066808998584747, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 684 + }, + { + "epoch": 0.5471246006389776, + "grad_norm": 0.19201414287090302, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 685 + }, + { + "epoch": 0.547923322683706, + "grad_norm": 0.19783100485801697, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 686 + }, + { + "epoch": 0.5487220447284346, + "grad_norm": 0.1431797295808792, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 687 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.04368956387042999, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 688 + }, + { + "epoch": 0.5503194888178914, + "grad_norm": 0.12395253777503967, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 689 + }, + { + "epoch": 0.5511182108626198, + "grad_norm": 0.16278770565986633, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 690 + }, + { + "epoch": 0.5519169329073482, + "grad_norm": 0.15368889272212982, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 691 + }, + { + "epoch": 0.5527156549520766, + "grad_norm": 0.10195931792259216, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 692 + }, + { + "epoch": 0.5535143769968051, + "grad_norm": 0.03421236202120781, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 693 + }, + { + "epoch": 0.5543130990415336, + "grad_norm": 0.09549148380756378, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 694 + }, + { + "epoch": 0.555111821086262, + "grad_norm": 0.17825989425182343, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 695 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 0.25296247005462646, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 696 + }, + { + "epoch": 0.5567092651757188, + "grad_norm": 0.27566400170326233, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 697 + }, + { + "epoch": 0.5575079872204473, + "grad_norm": 0.22609780728816986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 698 + }, + { + "epoch": 0.5583067092651757, + "grad_norm": 0.10555832833051682, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 699 + }, + { + "epoch": 0.5591054313099042, + "grad_norm": 0.1309640258550644, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 700 + }, + { + "epoch": 0.5599041533546326, + "grad_norm": 0.3434476852416992, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 701 + }, + { + "epoch": 0.560702875399361, + "grad_norm": 0.4559882581233978, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 702 + }, + { + "epoch": 0.5615015974440895, + "grad_norm": 0.390683650970459, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 703 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.14178164303302765, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 704 + }, + { + "epoch": 0.5630990415335463, + "grad_norm": 0.19113974273204803, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 705 + }, + { + "epoch": 0.5638977635782748, + "grad_norm": 0.38376086950302124, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 706 + }, + { + "epoch": 0.5646964856230032, + "grad_norm": 0.3486707806587219, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 707 + }, + { + "epoch": 0.5654952076677316, + "grad_norm": 0.14712302386760712, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 708 + }, + { + "epoch": 0.5662939297124601, + "grad_norm": 0.11827494204044342, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 709 + }, + { + "epoch": 0.5670926517571885, + "grad_norm": 0.27573689818382263, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 710 + }, + { + "epoch": 0.5678913738019169, + "grad_norm": 0.2983379065990448, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 711 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 0.2019582986831665, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 712 + }, + { + "epoch": 0.5694888178913738, + "grad_norm": 0.04186725243926048, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 713 + }, + { + "epoch": 0.5702875399361023, + "grad_norm": 0.16714231669902802, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 714 + }, + { + "epoch": 0.5710862619808307, + "grad_norm": 0.24982011318206787, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 715 + }, + { + "epoch": 0.5718849840255591, + "grad_norm": 0.22021397948265076, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 716 + }, + { + "epoch": 0.5726837060702875, + "grad_norm": 0.09717470407485962, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 717 + }, + { + "epoch": 0.5734824281150159, + "grad_norm": 0.10214962065219879, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 718 + }, + { + "epoch": 0.5742811501597445, + "grad_norm": 0.15325960516929626, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 719 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.11207877099514008, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 720 + }, + { + "epoch": 0.5758785942492013, + "grad_norm": 0.05425047129392624, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 721 + }, + { + "epoch": 0.5766773162939297, + "grad_norm": 0.0703732892870903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 722 + }, + { + "epoch": 0.5774760383386581, + "grad_norm": 0.10577918589115143, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 723 + }, + { + "epoch": 0.5782747603833865, + "grad_norm": 0.13230514526367188, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 724 + }, + { + "epoch": 0.579073482428115, + "grad_norm": 0.1878778040409088, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 725 + }, + { + "epoch": 0.5798722044728435, + "grad_norm": 0.19956567883491516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 726 + }, + { + "epoch": 0.5806709265175719, + "grad_norm": 0.13732020556926727, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 727 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 0.09844338148832321, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 728 + }, + { + "epoch": 0.5822683706070287, + "grad_norm": 0.056577637791633606, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 729 + }, + { + "epoch": 0.5830670926517572, + "grad_norm": 0.0835585743188858, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 730 + }, + { + "epoch": 0.5838658146964856, + "grad_norm": 0.0910082757472992, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 731 + }, + { + "epoch": 0.5846645367412141, + "grad_norm": 0.0659257099032402, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 732 + }, + { + "epoch": 0.5854632587859425, + "grad_norm": 0.09342535585165024, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 733 + }, + { + "epoch": 0.5862619808306709, + "grad_norm": 0.0627603679895401, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 734 + }, + { + "epoch": 0.5870607028753994, + "grad_norm": 0.10535050183534622, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 735 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.13628117740154266, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 736 + }, + { + "epoch": 0.5886581469648562, + "grad_norm": 0.0715300589799881, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 737 + }, + { + "epoch": 0.5894568690095847, + "grad_norm": 0.10892884433269501, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 738 + }, + { + "epoch": 0.5902555910543131, + "grad_norm": 0.09805259853601456, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 739 + }, + { + "epoch": 0.5910543130990416, + "grad_norm": 0.14491751790046692, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 740 + }, + { + "epoch": 0.59185303514377, + "grad_norm": 0.15448585152626038, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 741 + }, + { + "epoch": 0.5926517571884984, + "grad_norm": 0.08218494802713394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 742 + }, + { + "epoch": 0.5934504792332268, + "grad_norm": 0.16311237215995789, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 743 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 0.10310494899749756, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 744 + }, + { + "epoch": 0.5950479233226837, + "grad_norm": 0.1511978805065155, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 745 + }, + { + "epoch": 0.5958466453674122, + "grad_norm": 0.20440778136253357, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 746 + }, + { + "epoch": 0.5966453674121406, + "grad_norm": 0.20918506383895874, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 747 + }, + { + "epoch": 0.597444089456869, + "grad_norm": 0.20070627331733704, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 748 + }, + { + "epoch": 0.5982428115015974, + "grad_norm": 0.1142180860042572, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 749 + }, + { + "epoch": 0.5990415335463258, + "grad_norm": 0.09418357163667679, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 750 + }, + { + "epoch": 0.5998402555910544, + "grad_norm": 0.24306562542915344, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 751 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.3208121955394745, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 752 + }, + { + "epoch": 0.6014376996805112, + "grad_norm": 0.3070276081562042, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 753 + }, + { + "epoch": 0.6022364217252396, + "grad_norm": 0.17130877077579498, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 754 + }, + { + "epoch": 0.603035143769968, + "grad_norm": 0.0733935534954071, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 755 + }, + { + "epoch": 0.6038338658146964, + "grad_norm": 0.25525134801864624, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 756 + }, + { + "epoch": 0.604632587859425, + "grad_norm": 0.39397957921028137, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 757 + }, + { + "epoch": 0.6054313099041534, + "grad_norm": 0.39015471935272217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 758 + }, + { + "epoch": 0.6062300319488818, + "grad_norm": 0.1757609099149704, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 759 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 0.19901637732982635, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 760 + }, + { + "epoch": 0.6078274760383386, + "grad_norm": 0.46885979175567627, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 761 + }, + { + "epoch": 0.6086261980830671, + "grad_norm": 0.4650067687034607, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 762 + }, + { + "epoch": 0.6094249201277955, + "grad_norm": 0.16624194383621216, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 763 + }, + { + "epoch": 0.610223642172524, + "grad_norm": 0.23347698152065277, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 764 + }, + { + "epoch": 0.6110223642172524, + "grad_norm": 0.40192991495132446, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 765 + }, + { + "epoch": 0.6118210862619808, + "grad_norm": 0.33640867471694946, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 766 + }, + { + "epoch": 0.6126198083067093, + "grad_norm": 0.11979667842388153, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 767 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.17994286119937897, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 768 + }, + { + "epoch": 0.6142172523961661, + "grad_norm": 0.2693847715854645, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 769 + }, + { + "epoch": 0.6150159744408946, + "grad_norm": 0.2041584849357605, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 770 + }, + { + "epoch": 0.615814696485623, + "grad_norm": 0.052040908485651016, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 771 + }, + { + "epoch": 0.6166134185303515, + "grad_norm": 0.18652868270874023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 772 + }, + { + "epoch": 0.6174121405750799, + "grad_norm": 0.26122182607650757, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 773 + }, + { + "epoch": 0.6182108626198083, + "grad_norm": 0.15385891497135162, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 774 + }, + { + "epoch": 0.6190095846645367, + "grad_norm": 0.09217085689306259, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 775 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 0.23316404223442078, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 776 + }, + { + "epoch": 0.6206070287539937, + "grad_norm": 0.24094274640083313, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 777 + }, + { + "epoch": 0.6214057507987221, + "grad_norm": 0.08518059551715851, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 778 + }, + { + "epoch": 0.6222044728434505, + "grad_norm": 0.11076594144105911, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 779 + }, + { + "epoch": 0.6230031948881789, + "grad_norm": 0.1963978409767151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 780 + }, + { + "epoch": 0.6238019169329073, + "grad_norm": 0.1526973396539688, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 781 + }, + { + "epoch": 0.6246006389776357, + "grad_norm": 0.09434971958398819, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 782 + }, + { + "epoch": 0.6253993610223643, + "grad_norm": 0.2677021622657776, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 783 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.2885434329509735, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 784 + }, + { + "epoch": 0.6269968051118211, + "grad_norm": 0.14111816883087158, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 785 + }, + { + "epoch": 0.6277955271565495, + "grad_norm": 0.06594719737768173, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 786 + }, + { + "epoch": 0.6285942492012779, + "grad_norm": 0.09837283194065094, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 787 + }, + { + "epoch": 0.6293929712460063, + "grad_norm": 0.06089933589100838, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 788 + }, + { + "epoch": 0.6301916932907349, + "grad_norm": 0.16248181462287903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 789 + }, + { + "epoch": 0.6309904153354633, + "grad_norm": 0.298454612493515, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 790 + }, + { + "epoch": 0.6317891373801917, + "grad_norm": 0.3365437090396881, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 791 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 0.22858452796936035, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 792 + }, + { + "epoch": 0.6333865814696485, + "grad_norm": 0.04849984869360924, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 793 + }, + { + "epoch": 0.634185303514377, + "grad_norm": 0.24791331589221954, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 794 + }, + { + "epoch": 0.6349840255591054, + "grad_norm": 0.3028055727481842, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 795 + }, + { + "epoch": 0.6357827476038339, + "grad_norm": 0.15674540400505066, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 796 + }, + { + "epoch": 0.6365814696485623, + "grad_norm": 0.08521793782711029, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 797 + }, + { + "epoch": 0.6373801916932907, + "grad_norm": 0.21750952303409576, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 798 + }, + { + "epoch": 0.6381789137380192, + "grad_norm": 0.18880338966846466, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 799 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.06699419766664505, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 800 + }, + { + "epoch": 0.639776357827476, + "grad_norm": 0.08062998205423355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 801 + }, + { + "epoch": 0.6405750798722045, + "grad_norm": 0.10635658353567123, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 802 + }, + { + "epoch": 0.6413738019169329, + "grad_norm": 0.05086763948202133, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 803 + }, + { + "epoch": 0.6421725239616614, + "grad_norm": 0.09852107614278793, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 804 + }, + { + "epoch": 0.6429712460063898, + "grad_norm": 0.11290771514177322, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 805 + }, + { + "epoch": 0.6437699680511182, + "grad_norm": 0.15106825530529022, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 806 + }, + { + "epoch": 0.6445686900958466, + "grad_norm": 0.13646326959133148, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 807 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 0.06398668140172958, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 808 + }, + { + "epoch": 0.6461661341853036, + "grad_norm": 0.11581127345561981, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 809 + }, + { + "epoch": 0.646964856230032, + "grad_norm": 0.15684139728546143, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 810 + }, + { + "epoch": 0.6477635782747604, + "grad_norm": 0.14094121754169464, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 811 + }, + { + "epoch": 0.6485623003194888, + "grad_norm": 0.0938766822218895, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 812 + }, + { + "epoch": 0.6493610223642172, + "grad_norm": 0.06041521951556206, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 813 + }, + { + "epoch": 0.6501597444089456, + "grad_norm": 0.13364291191101074, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 814 + }, + { + "epoch": 0.6509584664536742, + "grad_norm": 0.15577054023742676, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 815 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.1119854673743248, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 816 + }, + { + "epoch": 0.652555910543131, + "grad_norm": 0.07751357555389404, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 817 + }, + { + "epoch": 0.6533546325878594, + "grad_norm": 0.10110143572092056, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 818 + }, + { + "epoch": 0.6541533546325878, + "grad_norm": 0.19627511501312256, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 819 + }, + { + "epoch": 0.6549520766773163, + "grad_norm": 0.19837769865989685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 820 + }, + { + "epoch": 0.6557507987220448, + "grad_norm": 0.13598690927028656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 821 + }, + { + "epoch": 0.6565495207667732, + "grad_norm": 0.05950666591525078, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 822 + }, + { + "epoch": 0.6573482428115016, + "grad_norm": 0.060314662754535675, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 823 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 0.11455138027667999, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 824 + }, + { + "epoch": 0.6589456869009584, + "grad_norm": 0.16753345727920532, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 825 + }, + { + "epoch": 0.6597444089456869, + "grad_norm": 0.15707428753376007, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 826 + }, + { + "epoch": 0.6605431309904153, + "grad_norm": 0.07224153727293015, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 827 + }, + { + "epoch": 0.6613418530351438, + "grad_norm": 0.10538042336702347, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 828 + }, + { + "epoch": 0.6621405750798722, + "grad_norm": 0.18855130672454834, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 829 + }, + { + "epoch": 0.6629392971246006, + "grad_norm": 0.17752179503440857, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 830 + }, + { + "epoch": 0.6637380191693291, + "grad_norm": 0.10109171271324158, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 831 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.15006190538406372, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 832 + }, + { + "epoch": 0.6653354632587859, + "grad_norm": 0.2701014578342438, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 833 + }, + { + "epoch": 0.6661341853035144, + "grad_norm": 0.2607312500476837, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 834 + }, + { + "epoch": 0.6669329073482428, + "grad_norm": 0.19712841510772705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 835 + }, + { + "epoch": 0.6677316293929713, + "grad_norm": 0.0839366614818573, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 836 + }, + { + "epoch": 0.6685303514376997, + "grad_norm": 0.1595088541507721, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 837 + }, + { + "epoch": 0.6693290734824281, + "grad_norm": 0.2773466408252716, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 838 + }, + { + "epoch": 0.6701277955271565, + "grad_norm": 0.24616314470767975, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 839 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 0.15596427023410797, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 840 + }, + { + "epoch": 0.6717252396166135, + "grad_norm": 0.047822993248701096, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 841 + }, + { + "epoch": 0.6725239616613419, + "grad_norm": 0.17692670226097107, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 842 + }, + { + "epoch": 0.6733226837060703, + "grad_norm": 0.1742856502532959, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 843 + }, + { + "epoch": 0.6741214057507987, + "grad_norm": 0.15347127616405487, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 844 + }, + { + "epoch": 0.6749201277955271, + "grad_norm": 0.18238374590873718, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 845 + }, + { + "epoch": 0.6757188498402555, + "grad_norm": 0.1524323672056198, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 846 + }, + { + "epoch": 0.6765175718849841, + "grad_norm": 0.1820068210363388, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 847 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.2010941058397293, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 848 + }, + { + "epoch": 0.6781150159744409, + "grad_norm": 0.16428111493587494, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 849 + }, + { + "epoch": 0.6789137380191693, + "grad_norm": 0.1538572460412979, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 850 + }, + { + "epoch": 0.6797124600638977, + "grad_norm": 0.057427916675806046, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 851 + }, + { + "epoch": 0.6805111821086262, + "grad_norm": 0.08329081535339355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 852 + }, + { + "epoch": 0.6813099041533547, + "grad_norm": 0.05685174837708473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 853 + }, + { + "epoch": 0.6821086261980831, + "grad_norm": 0.15277032554149628, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 854 + }, + { + "epoch": 0.6829073482428115, + "grad_norm": 0.24243640899658203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 855 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 0.28722453117370605, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 856 + }, + { + "epoch": 0.6845047923322684, + "grad_norm": 0.1997309774160385, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 857 + }, + { + "epoch": 0.6853035143769968, + "grad_norm": 0.061719026416540146, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 858 + }, + { + "epoch": 0.6861022364217252, + "grad_norm": 0.23425672948360443, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 859 + }, + { + "epoch": 0.6869009584664537, + "grad_norm": 0.350109726190567, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 860 + }, + { + "epoch": 0.6876996805111821, + "grad_norm": 0.34444838762283325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 861 + }, + { + "epoch": 0.6884984025559105, + "grad_norm": 0.15325413644313812, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 862 + }, + { + "epoch": 0.689297124600639, + "grad_norm": 0.1227702870965004, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 863 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 0.24337291717529297, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 864 + }, + { + "epoch": 0.6908945686900958, + "grad_norm": 0.24047589302062988, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 865 + }, + { + "epoch": 0.6916932907348243, + "grad_norm": 0.13576050102710724, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 866 + }, + { + "epoch": 0.6924920127795527, + "grad_norm": 0.0503714494407177, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 867 + }, + { + "epoch": 0.6932907348242812, + "grad_norm": 0.1292860060930252, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 868 + }, + { + "epoch": 0.6940894568690096, + "grad_norm": 0.14698486030101776, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 869 + }, + { + "epoch": 0.694888178913738, + "grad_norm": 0.07720573991537094, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 870 + }, + { + "epoch": 0.6956869009584664, + "grad_norm": 0.1604471504688263, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 871 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 0.32734861969947815, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 872 + }, + { + "epoch": 0.6972843450479234, + "grad_norm": 0.32366684079170227, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 873 + }, + { + "epoch": 0.6980830670926518, + "grad_norm": 0.18428802490234375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 874 + }, + { + "epoch": 0.6988817891373802, + "grad_norm": 0.07498858869075775, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 875 + }, + { + "epoch": 0.6996805111821086, + "grad_norm": 0.24449816346168518, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 876 + }, + { + "epoch": 0.700479233226837, + "grad_norm": 0.26649829745292664, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 877 + }, + { + "epoch": 0.7012779552715654, + "grad_norm": 0.1315024197101593, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 878 + }, + { + "epoch": 0.702076677316294, + "grad_norm": 0.10907325148582458, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 879 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.2364589273929596, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 880 + }, + { + "epoch": 0.7036741214057508, + "grad_norm": 0.1663885861635208, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 881 + }, + { + "epoch": 0.7044728434504792, + "grad_norm": 0.0596470907330513, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 882 + }, + { + "epoch": 0.7052715654952076, + "grad_norm": 0.1519233137369156, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 883 + }, + { + "epoch": 0.7060702875399361, + "grad_norm": 0.23089520633220673, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 884 + }, + { + "epoch": 0.7068690095846646, + "grad_norm": 0.20667214691638947, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 885 + }, + { + "epoch": 0.707667731629393, + "grad_norm": 0.10739922523498535, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 886 + }, + { + "epoch": 0.7084664536741214, + "grad_norm": 0.04334057494997978, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 887 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 0.15619881451129913, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 888 + }, + { + "epoch": 0.7100638977635783, + "grad_norm": 0.26618269085884094, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 889 + }, + { + "epoch": 0.7108626198083067, + "grad_norm": 0.1834406554698944, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 890 + }, + { + "epoch": 0.7116613418530351, + "grad_norm": 0.08332087099552155, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 891 + }, + { + "epoch": 0.7124600638977636, + "grad_norm": 0.23721523582935333, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 892 + }, + { + "epoch": 0.713258785942492, + "grad_norm": 0.2912815809249878, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 893 + }, + { + "epoch": 0.7140575079872205, + "grad_norm": 0.25534820556640625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 894 + }, + { + "epoch": 0.7148562300319489, + "grad_norm": 0.14200575649738312, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 895 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.08668249845504761, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 896 + }, + { + "epoch": 0.7164536741214057, + "grad_norm": 0.2358543574810028, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 897 + }, + { + "epoch": 0.7172523961661342, + "grad_norm": 0.2729748487472534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 898 + }, + { + "epoch": 0.7180511182108626, + "grad_norm": 0.14862589538097382, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 899 + }, + { + "epoch": 0.7188498402555911, + "grad_norm": 0.14500044286251068, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 900 + }, + { + "epoch": 0.7196485623003195, + "grad_norm": 0.28659892082214355, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 901 + }, + { + "epoch": 0.7204472843450479, + "grad_norm": 0.2974075376987457, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 902 + }, + { + "epoch": 0.7212460063897763, + "grad_norm": 0.07839605212211609, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 903 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 0.2542141079902649, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 904 + }, + { + "epoch": 0.7228434504792333, + "grad_norm": 0.357192724943161, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 905 + }, + { + "epoch": 0.7236421725239617, + "grad_norm": 0.21535371243953705, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 906 + }, + { + "epoch": 0.7244408945686901, + "grad_norm": 0.08053386211395264, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 907 + }, + { + "epoch": 0.7252396166134185, + "grad_norm": 0.22670729458332062, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 908 + }, + { + "epoch": 0.7260383386581469, + "grad_norm": 0.21510791778564453, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 909 + }, + { + "epoch": 0.7268370607028753, + "grad_norm": 0.07556216418743134, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 910 + }, + { + "epoch": 0.7276357827476039, + "grad_norm": 0.08772645890712738, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 911 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.2531013488769531, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 912 + }, + { + "epoch": 0.7292332268370607, + "grad_norm": 0.06658858805894852, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 913 + }, + { + "epoch": 0.7300319488817891, + "grad_norm": 0.09869293123483658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 914 + }, + { + "epoch": 0.7308306709265175, + "grad_norm": 0.17758162319660187, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 915 + }, + { + "epoch": 0.731629392971246, + "grad_norm": 0.16267521679401398, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 916 + }, + { + "epoch": 0.7324281150159745, + "grad_norm": 0.09948690980672836, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 917 + }, + { + "epoch": 0.7332268370607029, + "grad_norm": 0.05900302529335022, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 918 + }, + { + "epoch": 0.7340255591054313, + "grad_norm": 0.08200150728225708, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 919 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 0.09217624366283417, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 920 + }, + { + "epoch": 0.7356230031948882, + "grad_norm": 0.12414196133613586, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 921 + }, + { + "epoch": 0.7364217252396166, + "grad_norm": 0.131890669465065, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 922 + }, + { + "epoch": 0.737220447284345, + "grad_norm": 0.1187182292342186, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 923 + }, + { + "epoch": 0.7380191693290735, + "grad_norm": 0.09890205413103104, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 924 + }, + { + "epoch": 0.7388178913738019, + "grad_norm": 0.06730851531028748, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 925 + }, + { + "epoch": 0.7396166134185304, + "grad_norm": 0.038627006113529205, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 926 + }, + { + "epoch": 0.7404153354632588, + "grad_norm": 0.07148899137973785, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 927 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 0.05876476690173149, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 928 + }, + { + "epoch": 0.7420127795527156, + "grad_norm": 0.11069595813751221, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 929 + }, + { + "epoch": 0.7428115015974441, + "grad_norm": 0.10409362614154816, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 930 + }, + { + "epoch": 0.7436102236421726, + "grad_norm": 0.08115468919277191, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 931 + }, + { + "epoch": 0.744408945686901, + "grad_norm": 0.14105193316936493, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 932 + }, + { + "epoch": 0.7452076677316294, + "grad_norm": 0.07780246436595917, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 933 + }, + { + "epoch": 0.7460063897763578, + "grad_norm": 0.08895678073167801, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 934 + }, + { + "epoch": 0.7468051118210862, + "grad_norm": 0.10844068974256516, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 935 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 0.07179753482341766, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 936 + }, + { + "epoch": 0.7484025559105432, + "grad_norm": 0.11107192933559418, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 937 + }, + { + "epoch": 0.7492012779552716, + "grad_norm": 0.2845052480697632, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 938 + }, + { + "epoch": 0.75, + "grad_norm": 0.41480058431625366, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 939 + }, + { + "epoch": 0.7507987220447284, + "grad_norm": 0.3101426064968109, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 940 + }, + { + "epoch": 0.7515974440894568, + "grad_norm": 0.09521801024675369, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 941 + }, + { + "epoch": 0.7523961661341853, + "grad_norm": 0.18613341450691223, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 942 + }, + { + "epoch": 0.7531948881789138, + "grad_norm": 0.2665672302246094, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 943 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.20693817734718323, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 944 + }, + { + "epoch": 0.7547923322683706, + "grad_norm": 0.05853262171149254, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 945 + }, + { + "epoch": 0.755591054313099, + "grad_norm": 0.22123664617538452, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 946 + }, + { + "epoch": 0.7563897763578274, + "grad_norm": 0.2845379114151001, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 947 + }, + { + "epoch": 0.7571884984025559, + "grad_norm": 0.20357397198677063, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 948 + }, + { + "epoch": 0.7579872204472844, + "grad_norm": 0.0897352546453476, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 949 + }, + { + "epoch": 0.7587859424920128, + "grad_norm": 0.06572771817445755, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 950 + }, + { + "epoch": 0.7595846645367412, + "grad_norm": 0.09441806375980377, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 951 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 0.06848953664302826, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 952 + }, + { + "epoch": 0.7611821086261981, + "grad_norm": 0.127177432179451, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 953 + }, + { + "epoch": 0.7619808306709265, + "grad_norm": 0.25466713309288025, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 954 + }, + { + "epoch": 0.762779552715655, + "grad_norm": 0.32952556014060974, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 955 + }, + { + "epoch": 0.7635782747603834, + "grad_norm": 0.2976897358894348, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 956 + }, + { + "epoch": 0.7643769968051118, + "grad_norm": 0.17444387078285217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 957 + }, + { + "epoch": 0.7651757188498403, + "grad_norm": 0.10458981990814209, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 958 + }, + { + "epoch": 0.7659744408945687, + "grad_norm": 0.07028939574956894, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 959 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.1888386309146881, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 960 + }, + { + "epoch": 0.7675718849840255, + "grad_norm": 0.19400012493133545, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 961 + }, + { + "epoch": 0.768370607028754, + "grad_norm": 0.12069790065288544, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 962 + }, + { + "epoch": 0.7691693290734825, + "grad_norm": 0.06206851452589035, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 963 + }, + { + "epoch": 0.7699680511182109, + "grad_norm": 0.07195326685905457, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 964 + }, + { + "epoch": 0.7707667731629393, + "grad_norm": 0.09240477532148361, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 965 + }, + { + "epoch": 0.7715654952076677, + "grad_norm": 0.04433378204703331, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 966 + }, + { + "epoch": 0.7723642172523961, + "grad_norm": 0.07411819696426392, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 967 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 0.11440210789442062, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 968 + }, + { + "epoch": 0.7739616613418531, + "grad_norm": 0.23913118243217468, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 969 + }, + { + "epoch": 0.7747603833865815, + "grad_norm": 0.31028202176094055, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 970 + }, + { + "epoch": 0.7755591054313099, + "grad_norm": 0.3343825936317444, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 971 + }, + { + "epoch": 0.7763578274760383, + "grad_norm": 0.2559935748577118, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 972 + }, + { + "epoch": 0.7771565495207667, + "grad_norm": 0.05685359239578247, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 973 + }, + { + "epoch": 0.7779552715654952, + "grad_norm": 0.1760183721780777, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 974 + }, + { + "epoch": 0.7787539936102237, + "grad_norm": 0.25240832567214966, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 975 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.13724291324615479, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 976 + }, + { + "epoch": 0.7803514376996805, + "grad_norm": 0.11687567830085754, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 977 + }, + { + "epoch": 0.7811501597444089, + "grad_norm": 0.31319329142570496, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 978 + }, + { + "epoch": 0.7819488817891374, + "grad_norm": 0.3297184705734253, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 979 + }, + { + "epoch": 0.7827476038338658, + "grad_norm": 0.19443389773368835, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 980 + }, + { + "epoch": 0.7835463258785943, + "grad_norm": 0.04911043494939804, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 981 + }, + { + "epoch": 0.7843450479233227, + "grad_norm": 0.19837717711925507, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 982 + }, + { + "epoch": 0.7851437699680511, + "grad_norm": 0.23165349662303925, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 983 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 0.12156365066766739, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 984 + }, + { + "epoch": 0.786741214057508, + "grad_norm": 0.1305016428232193, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 985 + }, + { + "epoch": 0.7875399361022364, + "grad_norm": 0.12228422611951828, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 986 + }, + { + "epoch": 0.7883386581469649, + "grad_norm": 0.09014695137739182, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 987 + }, + { + "epoch": 0.7891373801916933, + "grad_norm": 0.060052234679460526, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 988 + }, + { + "epoch": 0.7899361022364217, + "grad_norm": 0.17842933535575867, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 989 + }, + { + "epoch": 0.7907348242811502, + "grad_norm": 0.2823020815849304, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 990 + }, + { + "epoch": 0.7915335463258786, + "grad_norm": 0.2571483254432678, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 991 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.11443623155355453, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 992 + }, + { + "epoch": 0.7931309904153354, + "grad_norm": 0.09048285335302353, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 993 + }, + { + "epoch": 0.7939297124600639, + "grad_norm": 0.1863749772310257, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 994 + }, + { + "epoch": 0.7947284345047924, + "grad_norm": 0.1481461524963379, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 995 + }, + { + "epoch": 0.7955271565495208, + "grad_norm": 0.06870540231466293, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 996 + }, + { + "epoch": 0.7963258785942492, + "grad_norm": 0.04223543405532837, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 997 + }, + { + "epoch": 0.7971246006389776, + "grad_norm": 0.04194851219654083, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 998 + }, + { + "epoch": 0.797923322683706, + "grad_norm": 0.03982497751712799, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 999 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 0.20985758304595947, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1000 + }, + { + "epoch": 0.799520766773163, + "grad_norm": 0.11346526443958282, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1001 + }, + { + "epoch": 0.8003194888178914, + "grad_norm": 0.16594401001930237, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1002 + }, + { + "epoch": 0.8011182108626198, + "grad_norm": 0.1788545846939087, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1003 + }, + { + "epoch": 0.8019169329073482, + "grad_norm": 0.07928512245416641, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1004 + }, + { + "epoch": 0.8027156549520766, + "grad_norm": 0.0953991562128067, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1005 + }, + { + "epoch": 0.8035143769968051, + "grad_norm": 0.2052081823348999, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1006 + }, + { + "epoch": 0.8043130990415336, + "grad_norm": 0.1999465525150299, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1007 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.09821965545415878, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1008 + }, + { + "epoch": 0.8059105431309904, + "grad_norm": 0.0762021467089653, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1009 + }, + { + "epoch": 0.8067092651757188, + "grad_norm": 0.20475991070270538, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 0.8075079872204473, + "grad_norm": 0.23028631508350372, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1011 + }, + { + "epoch": 0.8083067092651757, + "grad_norm": 0.12122747302055359, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.8091054313099042, + "grad_norm": 0.08124672621488571, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1013 + }, + { + "epoch": 0.8099041533546326, + "grad_norm": 0.21313415467739105, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1014 + }, + { + "epoch": 0.810702875399361, + "grad_norm": 0.311813622713089, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1015 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 0.3032541275024414, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1016 + }, + { + "epoch": 0.8123003194888179, + "grad_norm": 0.21727560460567474, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1017 + }, + { + "epoch": 0.8130990415335463, + "grad_norm": 0.0620480477809906, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1018 + }, + { + "epoch": 0.8138977635782748, + "grad_norm": 0.20105740427970886, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1019 + }, + { + "epoch": 0.8146964856230032, + "grad_norm": 0.28996244072914124, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1020 + }, + { + "epoch": 0.8154952076677316, + "grad_norm": 0.22115157544612885, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1021 + }, + { + "epoch": 0.8162939297124601, + "grad_norm": 0.10071029514074326, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1022 + }, + { + "epoch": 0.8170926517571885, + "grad_norm": 0.12363877147436142, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1023 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.29970163106918335, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1024 + }, + { + "epoch": 0.8186900958466453, + "grad_norm": 0.32754749059677124, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1025 + }, + { + "epoch": 0.8194888178913738, + "grad_norm": 0.20028825104236603, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1026 + }, + { + "epoch": 0.8202875399361023, + "grad_norm": 0.08162792772054672, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1027 + }, + { + "epoch": 0.8210862619808307, + "grad_norm": 0.27463749051094055, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1028 + }, + { + "epoch": 0.8218849840255591, + "grad_norm": 0.30335354804992676, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1029 + }, + { + "epoch": 0.8226837060702875, + "grad_norm": 0.12106633186340332, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1030 + }, + { + "epoch": 0.8234824281150159, + "grad_norm": 0.16331955790519714, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1031 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 0.2764187455177307, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1032 + }, + { + "epoch": 0.8250798722044729, + "grad_norm": 0.20136456191539764, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1033 + }, + { + "epoch": 0.8258785942492013, + "grad_norm": 0.06438590586185455, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1034 + }, + { + "epoch": 0.8266773162939297, + "grad_norm": 0.18764367699623108, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1035 + }, + { + "epoch": 0.8274760383386581, + "grad_norm": 0.20327645540237427, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1036 + }, + { + "epoch": 0.8282747603833865, + "grad_norm": 0.08825036138296127, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1037 + }, + { + "epoch": 0.829073482428115, + "grad_norm": 0.11037785559892654, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1038 + }, + { + "epoch": 0.8298722044728435, + "grad_norm": 0.18273280560970306, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1039 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.16820372641086578, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1040 + }, + { + "epoch": 0.8314696485623003, + "grad_norm": 0.06250625103712082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1041 + }, + { + "epoch": 0.8322683706070287, + "grad_norm": 0.12141115218400955, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1042 + }, + { + "epoch": 0.8330670926517572, + "grad_norm": 0.13594450056552887, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1043 + }, + { + "epoch": 0.8338658146964856, + "grad_norm": 0.16069599986076355, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1044 + }, + { + "epoch": 0.8346645367412141, + "grad_norm": 0.11631255596876144, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1045 + }, + { + "epoch": 0.8354632587859425, + "grad_norm": 0.050075192004442215, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1046 + }, + { + "epoch": 0.8362619808306709, + "grad_norm": 0.06317511945962906, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1047 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 0.09078527241945267, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1048 + }, + { + "epoch": 0.8378594249201278, + "grad_norm": 0.1618194878101349, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1049 + }, + { + "epoch": 0.8386581469648562, + "grad_norm": 0.2044777274131775, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1050 + }, + { + "epoch": 0.8394568690095847, + "grad_norm": 0.20439067482948303, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.8402555910543131, + "grad_norm": 0.1967901587486267, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1052 + }, + { + "epoch": 0.8410543130990416, + "grad_norm": 0.06829354166984558, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1053 + }, + { + "epoch": 0.84185303514377, + "grad_norm": 0.12168806046247482, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1054 + }, + { + "epoch": 0.8426517571884984, + "grad_norm": 0.23461978137493134, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1055 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.28916484117507935, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1056 + }, + { + "epoch": 0.8442492012779552, + "grad_norm": 0.21827733516693115, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1057 + }, + { + "epoch": 0.8450479233226837, + "grad_norm": 0.045396093279123306, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1058 + }, + { + "epoch": 0.8458466453674122, + "grad_norm": 0.2391543984413147, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1059 + }, + { + "epoch": 0.8466453674121406, + "grad_norm": 0.2916122078895569, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1060 + }, + { + "epoch": 0.847444089456869, + "grad_norm": 0.1589413434267044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1061 + }, + { + "epoch": 0.8482428115015974, + "grad_norm": 0.14869733154773712, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1062 + }, + { + "epoch": 0.8490415335463258, + "grad_norm": 0.3719956874847412, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 1063 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1064 + }, + { + "epoch": 0.8506389776357828, + "grad_norm": 0.22647641599178314, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1065 + }, + { + "epoch": 0.8514376996805112, + "grad_norm": 0.14329837262630463, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1066 + }, + { + "epoch": 0.8522364217252396, + "grad_norm": 0.2508337199687958, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1067 + }, + { + "epoch": 0.853035143769968, + "grad_norm": 0.16483807563781738, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1068 + }, + { + "epoch": 0.8538338658146964, + "grad_norm": 0.08231265842914581, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1069 + }, + { + "epoch": 0.854632587859425, + "grad_norm": 0.15707719326019287, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1070 + }, + { + "epoch": 0.8554313099041534, + "grad_norm": 0.1741408407688141, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1071 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.06281771510839462, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1072 + }, + { + "epoch": 0.8570287539936102, + "grad_norm": 0.10936494171619415, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1073 + }, + { + "epoch": 0.8578274760383386, + "grad_norm": 0.08680932223796844, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1074 + }, + { + "epoch": 0.8586261980830671, + "grad_norm": 0.05679824575781822, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1075 + }, + { + "epoch": 0.8594249201277955, + "grad_norm": 0.07635466009378433, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1076 + }, + { + "epoch": 0.860223642172524, + "grad_norm": 0.08391202241182327, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 1077 + }, + { + "epoch": 0.8610223642172524, + "grad_norm": 0.044910602271556854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1078 + }, + { + "epoch": 0.8618210862619808, + "grad_norm": 0.07833745330572128, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 1079 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 0.11653397232294083, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1080 + }, + { + "epoch": 0.8634185303514377, + "grad_norm": 0.09041672199964523, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1081 + }, + { + "epoch": 0.8642172523961661, + "grad_norm": 0.061735767871141434, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1082 + }, + { + "epoch": 0.8650159744408946, + "grad_norm": 0.042857520282268524, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1083 + }, + { + "epoch": 0.865814696485623, + "grad_norm": 0.040145136415958405, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1084 + }, + { + "epoch": 0.8666134185303515, + "grad_norm": 0.05785573646426201, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1085 + }, + { + "epoch": 0.8674121405750799, + "grad_norm": 0.13503877818584442, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1086 + }, + { + "epoch": 0.8682108626198083, + "grad_norm": 0.16243800520896912, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1087 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.13211014866828918, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1088 + }, + { + "epoch": 0.8698083067092651, + "grad_norm": 0.08136262744665146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1089 + }, + { + "epoch": 0.8706070287539937, + "grad_norm": 0.07881205528974533, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1090 + }, + { + "epoch": 0.8714057507987221, + "grad_norm": 0.1660437136888504, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1091 + }, + { + "epoch": 0.8722044728434505, + "grad_norm": 0.1955040693283081, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1092 + }, + { + "epoch": 0.8730031948881789, + "grad_norm": 0.18039803206920624, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1093 + }, + { + "epoch": 0.8738019169329073, + "grad_norm": 0.13832250237464905, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1094 + }, + { + "epoch": 0.8746006389776357, + "grad_norm": 0.06982281059026718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1095 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 0.06607141345739365, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1096 + }, + { + "epoch": 0.8761980830670927, + "grad_norm": 0.08685869723558426, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1097 + }, + { + "epoch": 0.8769968051118211, + "grad_norm": 0.09157849103212357, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1098 + }, + { + "epoch": 0.8777955271565495, + "grad_norm": 0.05980607122182846, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1099 + }, + { + "epoch": 0.8785942492012779, + "grad_norm": 0.05037426948547363, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1100 + }, + { + "epoch": 0.8793929712460063, + "grad_norm": 0.09998175501823425, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 1101 + }, + { + "epoch": 0.8801916932907349, + "grad_norm": 0.14255133271217346, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1102 + }, + { + "epoch": 0.8809904153354633, + "grad_norm": 0.1332579255104065, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1103 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.06453413516283035, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1104 + }, + { + "epoch": 0.8825878594249201, + "grad_norm": 0.07107783854007721, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1105 + }, + { + "epoch": 0.8833865814696485, + "grad_norm": 0.14025849103927612, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1106 + }, + { + "epoch": 0.884185303514377, + "grad_norm": 0.18791186809539795, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1107 + }, + { + "epoch": 0.8849840255591054, + "grad_norm": 0.228570356965065, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1108 + }, + { + "epoch": 0.8857827476038339, + "grad_norm": 0.21574346721172333, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1109 + }, + { + "epoch": 0.8865814696485623, + "grad_norm": 0.14833906292915344, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1110 + }, + { + "epoch": 0.8873801916932907, + "grad_norm": 0.04756765812635422, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1111 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 0.13023658096790314, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1112 + }, + { + "epoch": 0.8889776357827476, + "grad_norm": 0.21199558675289154, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1113 + }, + { + "epoch": 0.889776357827476, + "grad_norm": 0.19635719060897827, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1114 + }, + { + "epoch": 0.8905750798722045, + "grad_norm": 0.14753709733486176, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1115 + }, + { + "epoch": 0.8913738019169329, + "grad_norm": 0.06639572232961655, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1116 + }, + { + "epoch": 0.8921725239616614, + "grad_norm": 0.09707840532064438, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1117 + }, + { + "epoch": 0.8929712460063898, + "grad_norm": 0.20057998597621918, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 1118 + }, + { + "epoch": 0.8937699680511182, + "grad_norm": 0.232718825340271, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1119 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.16340196132659912, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1120 + }, + { + "epoch": 0.895367412140575, + "grad_norm": 0.04553915560245514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 1121 + }, + { + "epoch": 0.8961661341853036, + "grad_norm": 0.12561571598052979, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1122 + }, + { + "epoch": 0.896964856230032, + "grad_norm": 0.19254666566848755, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1123 + }, + { + "epoch": 0.8977635782747604, + "grad_norm": 0.12862572073936462, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1124 + }, + { + "epoch": 0.8985623003194888, + "grad_norm": 0.051237158477306366, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1125 + }, + { + "epoch": 0.8993610223642172, + "grad_norm": 0.18603810667991638, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1126 + }, + { + "epoch": 0.9001597444089456, + "grad_norm": 0.2498294860124588, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1127 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 0.18809954822063446, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1128 + }, + { + "epoch": 0.9017571884984026, + "grad_norm": 0.06116599217057228, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1129 + }, + { + "epoch": 0.902555910543131, + "grad_norm": 0.07710137963294983, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1130 + }, + { + "epoch": 0.9033546325878594, + "grad_norm": 0.11208303272724152, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1131 + }, + { + "epoch": 0.9041533546325878, + "grad_norm": 0.11864814907312393, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1132 + }, + { + "epoch": 0.9049520766773163, + "grad_norm": 0.1261119246482849, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1133 + }, + { + "epoch": 0.9057507987220448, + "grad_norm": 0.10841526836156845, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1134 + }, + { + "epoch": 0.9065495207667732, + "grad_norm": 0.04871276393532753, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1135 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.08953645080327988, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1136 + }, + { + "epoch": 0.90814696485623, + "grad_norm": 0.1590365469455719, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1137 + }, + { + "epoch": 0.9089456869009584, + "grad_norm": 0.155691459774971, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1138 + }, + { + "epoch": 0.9097444089456869, + "grad_norm": 0.09982484579086304, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1139 + }, + { + "epoch": 0.9105431309904153, + "grad_norm": 0.08257611095905304, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1140 + }, + { + "epoch": 0.9113418530351438, + "grad_norm": 0.1036139577627182, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1141 + }, + { + "epoch": 0.9121405750798722, + "grad_norm": 0.06543707102537155, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1142 + }, + { + "epoch": 0.9129392971246006, + "grad_norm": 0.05375903844833374, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1143 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 0.13674795627593994, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1144 + }, + { + "epoch": 0.9145367412140575, + "grad_norm": 0.21575352549552917, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 1145 + }, + { + "epoch": 0.9153354632587859, + "grad_norm": 0.22478559613227844, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1146 + }, + { + "epoch": 0.9161341853035144, + "grad_norm": 0.1854555904865265, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1147 + }, + { + "epoch": 0.9169329073482428, + "grad_norm": 0.08605340123176575, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1148 + }, + { + "epoch": 0.9177316293929713, + "grad_norm": 0.14082656800746918, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1149 + }, + { + "epoch": 0.9185303514376997, + "grad_norm": 0.3214903771877289, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1150 + }, + { + "epoch": 0.9193290734824281, + "grad_norm": 0.4360012412071228, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 1151 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.3582250773906708, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1152 + }, + { + "epoch": 0.920926517571885, + "grad_norm": 0.1142783984541893, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1153 + }, + { + "epoch": 0.9217252396166135, + "grad_norm": 0.2035343497991562, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1154 + }, + { + "epoch": 0.9225239616613419, + "grad_norm": 0.3506172299385071, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1155 + }, + { + "epoch": 0.9233226837060703, + "grad_norm": 0.2129906564950943, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1156 + }, + { + "epoch": 0.9241214057507987, + "grad_norm": 0.12158108502626419, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1157 + }, + { + "epoch": 0.9249201277955271, + "grad_norm": 0.3931717872619629, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1158 + }, + { + "epoch": 0.9257188498402555, + "grad_norm": 0.36336907744407654, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1159 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 0.06781382113695145, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1160 + }, + { + "epoch": 0.9273162939297125, + "grad_norm": 0.3335910141468048, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1161 + }, + { + "epoch": 0.9281150159744409, + "grad_norm": 0.5017055869102478, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 1162 + }, + { + "epoch": 0.9289137380191693, + "grad_norm": 0.3635455071926117, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1163 + }, + { + "epoch": 0.9297124600638977, + "grad_norm": 0.06748906522989273, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1164 + }, + { + "epoch": 0.9305111821086262, + "grad_norm": 0.3723882734775543, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1165 + }, + { + "epoch": 0.9313099041533547, + "grad_norm": 0.2976631820201874, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1166 + }, + { + "epoch": 0.9321086261980831, + "grad_norm": 0.06998804211616516, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1167 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.3307324945926666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1168 + }, + { + "epoch": 0.9337060702875399, + "grad_norm": 0.29726436734199524, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1169 + }, + { + "epoch": 0.9345047923322684, + "grad_norm": 0.048596691340208054, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1170 + }, + { + "epoch": 0.9353035143769968, + "grad_norm": 0.2840823233127594, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1171 + }, + { + "epoch": 0.9361022364217252, + "grad_norm": 0.31426292657852173, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1172 + }, + { + "epoch": 0.9369009584664537, + "grad_norm": 0.16073261201381683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1173 + }, + { + "epoch": 0.9376996805111821, + "grad_norm": 0.05725392326712608, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1174 + }, + { + "epoch": 0.9384984025559105, + "grad_norm": 0.1674586981534958, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1175 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 0.13738949596881866, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1176 + }, + { + "epoch": 0.9400958466453674, + "grad_norm": 0.05350235849618912, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1177 + }, + { + "epoch": 0.9408945686900958, + "grad_norm": 0.10518805682659149, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.9416932907348243, + "grad_norm": 0.11264974623918533, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1179 + }, + { + "epoch": 0.9424920127795527, + "grad_norm": 0.06757227331399918, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1180 + }, + { + "epoch": 0.9432907348242812, + "grad_norm": 0.07214303314685822, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1181 + }, + { + "epoch": 0.9440894568690096, + "grad_norm": 0.12705406546592712, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1182 + }, + { + "epoch": 0.944888178913738, + "grad_norm": 0.09937570244073868, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 1183 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 0.05628623813390732, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1184 + }, + { + "epoch": 0.9464856230031949, + "grad_norm": 0.05685505270957947, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1185 + }, + { + "epoch": 0.9472843450479234, + "grad_norm": 0.06150783598423004, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1186 + }, + { + "epoch": 0.9480830670926518, + "grad_norm": 0.04247362166643143, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1187 + }, + { + "epoch": 0.9488817891373802, + "grad_norm": 0.05664962902665138, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1188 + }, + { + "epoch": 0.9496805111821086, + "grad_norm": 0.07421324402093887, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1189 + }, + { + "epoch": 0.950479233226837, + "grad_norm": 0.043645020574331284, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1190 + }, + { + "epoch": 0.9512779552715654, + "grad_norm": 0.0692208856344223, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1191 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 0.13804891705513, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1192 + }, + { + "epoch": 0.9528753993610224, + "grad_norm": 0.14874884486198425, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1193 + }, + { + "epoch": 0.9536741214057508, + "grad_norm": 0.08449128270149231, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1194 + }, + { + "epoch": 0.9544728434504792, + "grad_norm": 0.035032968968153, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1195 + }, + { + "epoch": 0.9552715654952076, + "grad_norm": 0.10837965458631516, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1196 + }, + { + "epoch": 0.9560702875399361, + "grad_norm": 0.17972581088542938, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1197 + }, + { + "epoch": 0.9568690095846646, + "grad_norm": 0.17075787484645844, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1198 + }, + { + "epoch": 0.957667731629393, + "grad_norm": 0.08269231766462326, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1199 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.07269515842199326, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1200 + }, + { + "epoch": 0.9592651757188498, + "grad_norm": 0.15345947444438934, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1201 + }, + { + "epoch": 0.9600638977635783, + "grad_norm": 0.19025452435016632, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1202 + }, + { + "epoch": 0.9608626198083067, + "grad_norm": 0.1782686710357666, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1203 + }, + { + "epoch": 0.9616613418530351, + "grad_norm": 0.1296931356191635, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1204 + }, + { + "epoch": 0.9624600638977636, + "grad_norm": 0.036208219826221466, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1205 + }, + { + "epoch": 0.963258785942492, + "grad_norm": 0.14282052218914032, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1206 + }, + { + "epoch": 0.9640575079872205, + "grad_norm": 0.26539498567581177, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1207 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 0.28352224826812744, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1208 + }, + { + "epoch": 0.9656549520766773, + "grad_norm": 0.14476369321346283, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1209 + }, + { + "epoch": 0.9664536741214057, + "grad_norm": 0.06859725713729858, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1210 + }, + { + "epoch": 0.9672523961661342, + "grad_norm": 0.19093726575374603, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1211 + }, + { + "epoch": 0.9680511182108626, + "grad_norm": 0.1848185807466507, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1212 + }, + { + "epoch": 0.9688498402555911, + "grad_norm": 0.05829976871609688, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1213 + }, + { + "epoch": 0.9696485623003195, + "grad_norm": 0.10105405002832413, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1214 + }, + { + "epoch": 0.9704472843450479, + "grad_norm": 0.12762011587619781, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1215 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.08238376677036285, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1216 + }, + { + "epoch": 0.9720447284345048, + "grad_norm": 0.07039444148540497, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1217 + }, + { + "epoch": 0.9728434504792333, + "grad_norm": 0.1320599615573883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1218 + }, + { + "epoch": 0.9736421725239617, + "grad_norm": 0.07799404859542847, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1219 + }, + { + "epoch": 0.9744408945686901, + "grad_norm": 0.11601961404085159, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1220 + }, + { + "epoch": 0.9752396166134185, + "grad_norm": 0.26134374737739563, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1221 + }, + { + "epoch": 0.9760383386581469, + "grad_norm": 0.275513231754303, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1222 + }, + { + "epoch": 0.9768370607028753, + "grad_norm": 0.0711631178855896, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1223 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 0.1879139244556427, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1224 + }, + { + "epoch": 0.9784345047923323, + "grad_norm": 0.24822647869586945, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1225 + }, + { + "epoch": 0.9792332268370607, + "grad_norm": 0.1244853138923645, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1226 + }, + { + "epoch": 0.9800319488817891, + "grad_norm": 0.07694529742002487, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1227 + }, + { + "epoch": 0.9808306709265175, + "grad_norm": 0.1280626803636551, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1228 + }, + { + "epoch": 0.981629392971246, + "grad_norm": 0.09127703309059143, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1229 + }, + { + "epoch": 0.9824281150159745, + "grad_norm": 0.06747932732105255, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1230 + }, + { + "epoch": 0.9832268370607029, + "grad_norm": 0.08196533471345901, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1231 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.09074689447879791, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1232 + }, + { + "epoch": 0.9848242811501597, + "grad_norm": 0.06031282991170883, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1233 + }, + { + "epoch": 0.9856230031948882, + "grad_norm": 0.07138215005397797, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1234 + }, + { + "epoch": 0.9864217252396166, + "grad_norm": 0.11056806892156601, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1235 + }, + { + "epoch": 0.987220447284345, + "grad_norm": 0.09108638018369675, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1236 + }, + { + "epoch": 0.9880191693290735, + "grad_norm": 0.0515020377933979, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1237 + }, + { + "epoch": 0.9888178913738019, + "grad_norm": 0.08467873930931091, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1238 + }, + { + "epoch": 0.9896166134185304, + "grad_norm": 0.10424523055553436, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1239 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 0.11506868153810501, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1240 + }, + { + "epoch": 0.9912140575079872, + "grad_norm": 0.13226476311683655, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1241 + }, + { + "epoch": 0.9920127795527156, + "grad_norm": 0.13714630901813507, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1242 + }, + { + "epoch": 0.9928115015974441, + "grad_norm": 0.08985403180122375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1243 + }, + { + "epoch": 0.9936102236421726, + "grad_norm": 0.1107666939496994, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 1244 + }, + { + "epoch": 0.994408945686901, + "grad_norm": 0.130653515458107, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1245 + }, + { + "epoch": 0.9952076677316294, + "grad_norm": 0.10675778985023499, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1246 + }, + { + "epoch": 0.9960063897763578, + "grad_norm": 0.042045243084430695, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1247 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.07957674562931061, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1248 + }, + { + "epoch": 0.9976038338658147, + "grad_norm": 0.06926224380731583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1249 + }, + { + "epoch": 0.9984025559105432, + "grad_norm": 0.0849846750497818, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1250 + }, + { + "epoch": 0.9992012779552716, + "grad_norm": 0.12501482665538788, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1251 + }, + { + "epoch": 1.0, + "grad_norm": 0.1467234194278717, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1252 + }, + { + "epoch": 1.0007987220447285, + "grad_norm": 0.11206725984811783, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1253 + }, + { + "epoch": 1.0015974440894568, + "grad_norm": 0.05224297568202019, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1254 + }, + { + "epoch": 1.0023961661341854, + "grad_norm": 0.15176911652088165, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1255 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.22419261932373047, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1256 + }, + { + "epoch": 1.0039936102236422, + "grad_norm": 0.18444369733333588, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1257 + }, + { + "epoch": 1.0047923322683705, + "grad_norm": 0.06510337442159653, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1258 + }, + { + "epoch": 1.005591054313099, + "grad_norm": 0.16058789193630219, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1259 + }, + { + "epoch": 1.0063897763578276, + "grad_norm": 0.22726313769817352, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1260 + }, + { + "epoch": 1.0071884984025559, + "grad_norm": 0.21050630509853363, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1261 + }, + { + "epoch": 1.0079872204472844, + "grad_norm": 0.09227188676595688, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1262 + }, + { + "epoch": 1.0087859424920127, + "grad_norm": 0.11473584920167923, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 1263 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 0.12692919373512268, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1264 + }, + { + "epoch": 1.0103833865814698, + "grad_norm": 0.056371819227933884, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1265 + }, + { + "epoch": 1.011182108626198, + "grad_norm": 0.13166245818138123, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1266 + }, + { + "epoch": 1.0119808306709266, + "grad_norm": 0.2606523633003235, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1267 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 0.320832759141922, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1268 + }, + { + "epoch": 1.0135782747603834, + "grad_norm": 0.2074427455663681, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1269 + }, + { + "epoch": 1.0143769968051117, + "grad_norm": 0.05768958851695061, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1270 + }, + { + "epoch": 1.0151757188498403, + "grad_norm": 0.08107002079486847, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1271 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 0.12996292114257812, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1272 + }, + { + "epoch": 1.016773162939297, + "grad_norm": 0.1514650285243988, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1273 + }, + { + "epoch": 1.0175718849840256, + "grad_norm": 0.1007395088672638, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1274 + }, + { + "epoch": 1.018370607028754, + "grad_norm": 0.0831306204199791, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1275 + }, + { + "epoch": 1.0191693290734825, + "grad_norm": 0.09004336595535278, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1276 + }, + { + "epoch": 1.0199680511182108, + "grad_norm": 0.06632232666015625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1277 + }, + { + "epoch": 1.0207667731629393, + "grad_norm": 0.05073424428701401, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1278 + }, + { + "epoch": 1.0215654952076678, + "grad_norm": 0.06486333161592484, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 1279 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 0.1137472614645958, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1280 + }, + { + "epoch": 1.0231629392971247, + "grad_norm": 0.08062250912189484, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1281 + }, + { + "epoch": 1.023961661341853, + "grad_norm": 0.05046350136399269, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1282 + }, + { + "epoch": 1.0247603833865815, + "grad_norm": 0.06503880023956299, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1283 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 0.10730332881212234, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1284 + }, + { + "epoch": 1.0263578274760383, + "grad_norm": 0.12077611684799194, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1285 + }, + { + "epoch": 1.0271565495207668, + "grad_norm": 0.15061219036579132, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1286 + }, + { + "epoch": 1.0279552715654952, + "grad_norm": 0.15091058611869812, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1287 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 0.07299874722957611, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1288 + }, + { + "epoch": 1.029552715654952, + "grad_norm": 0.09598413854837418, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1289 + }, + { + "epoch": 1.0303514376996805, + "grad_norm": 0.21661055088043213, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 1290 + }, + { + "epoch": 1.031150159744409, + "grad_norm": 0.24777255952358246, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1291 + }, + { + "epoch": 1.0319488817891374, + "grad_norm": 0.17097236216068268, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1292 + }, + { + "epoch": 1.0327476038338659, + "grad_norm": 0.05266748368740082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1293 + }, + { + "epoch": 1.0335463258785942, + "grad_norm": 0.12484195083379745, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1294 + }, + { + "epoch": 1.0343450479233227, + "grad_norm": 0.1802505999803543, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1295 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 0.10778877139091492, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1296 + }, + { + "epoch": 1.0359424920127795, + "grad_norm": 0.046645063906908035, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1297 + }, + { + "epoch": 1.036741214057508, + "grad_norm": 0.11727745085954666, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1298 + }, + { + "epoch": 1.0375399361022364, + "grad_norm": 0.1356390118598938, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1299 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 0.08130940794944763, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1300 + }, + { + "epoch": 1.0391373801916932, + "grad_norm": 0.07274319976568222, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1301 + }, + { + "epoch": 1.0399361022364217, + "grad_norm": 0.20339541137218475, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1302 + }, + { + "epoch": 1.04073482428115, + "grad_norm": 0.27819424867630005, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 1303 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 0.25879770517349243, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1304 + }, + { + "epoch": 1.042332268370607, + "grad_norm": 0.12683863937854767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1305 + }, + { + "epoch": 1.0431309904153354, + "grad_norm": 0.13531504571437836, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1306 + }, + { + "epoch": 1.043929712460064, + "grad_norm": 0.3203699588775635, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1307 + }, + { + "epoch": 1.0447284345047922, + "grad_norm": 0.3073630630970001, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1308 + }, + { + "epoch": 1.0455271565495208, + "grad_norm": 0.13184015452861786, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1309 + }, + { + "epoch": 1.0463258785942493, + "grad_norm": 0.1311715543270111, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1310 + }, + { + "epoch": 1.0471246006389776, + "grad_norm": 0.24470581114292145, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1311 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 0.21901719272136688, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1312 + }, + { + "epoch": 1.0487220447284344, + "grad_norm": 0.08105460554361343, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1313 + }, + { + "epoch": 1.049520766773163, + "grad_norm": 0.14864705502986908, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1314 + }, + { + "epoch": 1.0503194888178913, + "grad_norm": 0.20006732642650604, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1315 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 0.06233162060379982, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1316 + }, + { + "epoch": 1.0519169329073483, + "grad_norm": 0.12691672146320343, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1317 + }, + { + "epoch": 1.0527156549520766, + "grad_norm": 0.18303292989730835, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1318 + }, + { + "epoch": 1.0535143769968052, + "grad_norm": 0.13289928436279297, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1319 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 0.03847618028521538, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1320 + }, + { + "epoch": 1.055111821086262, + "grad_norm": 0.1317387968301773, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1321 + }, + { + "epoch": 1.0559105431309903, + "grad_norm": 0.1663348227739334, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1322 + }, + { + "epoch": 1.0567092651757188, + "grad_norm": 0.0657038614153862, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1323 + }, + { + "epoch": 1.0575079872204474, + "grad_norm": 0.1484680026769638, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1324 + }, + { + "epoch": 1.0583067092651757, + "grad_norm": 0.299824595451355, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1325 + }, + { + "epoch": 1.0591054313099042, + "grad_norm": 0.3598216772079468, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1326 + }, + { + "epoch": 1.0599041533546325, + "grad_norm": 0.25792455673217773, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1327 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 0.04925544187426567, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1328 + }, + { + "epoch": 1.0615015974440896, + "grad_norm": 0.2568669319152832, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1329 + }, + { + "epoch": 1.0623003194888179, + "grad_norm": 0.2679016590118408, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1330 + }, + { + "epoch": 1.0630990415335464, + "grad_norm": 0.12100119888782501, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1331 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 0.17324721813201904, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1332 + }, + { + "epoch": 1.0646964856230032, + "grad_norm": 0.34452658891677856, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1333 + }, + { + "epoch": 1.0654952076677315, + "grad_norm": 0.24561382830142975, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1334 + }, + { + "epoch": 1.06629392971246, + "grad_norm": 0.06080634891986847, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1335 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 0.249319925904274, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1336 + }, + { + "epoch": 1.067891373801917, + "grad_norm": 0.2586004436016083, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1337 + }, + { + "epoch": 1.0686900958466454, + "grad_norm": 0.07297322154045105, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1338 + }, + { + "epoch": 1.0694888178913737, + "grad_norm": 0.20853886008262634, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1339 + }, + { + "epoch": 1.0702875399361023, + "grad_norm": 0.3214154541492462, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1340 + }, + { + "epoch": 1.0710862619808306, + "grad_norm": 0.16169136762619019, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1341 + }, + { + "epoch": 1.071884984025559, + "grad_norm": 0.18989364802837372, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1342 + }, + { + "epoch": 1.0726837060702876, + "grad_norm": 0.42826735973358154, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1343 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 0.35387369990348816, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1344 + }, + { + "epoch": 1.0742811501597445, + "grad_norm": 0.061617862433195114, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 1345 + }, + { + "epoch": 1.0750798722044728, + "grad_norm": 0.3348129987716675, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1346 + }, + { + "epoch": 1.0758785942492013, + "grad_norm": 0.3622291088104248, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1347 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 0.12743668258190155, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1348 + }, + { + "epoch": 1.0774760383386581, + "grad_norm": 0.2464202642440796, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1349 + }, + { + "epoch": 1.0782747603833867, + "grad_norm": 0.3873802423477173, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1350 + }, + { + "epoch": 1.079073482428115, + "grad_norm": 0.22619839012622833, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1351 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 0.09080081433057785, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 1352 + }, + { + "epoch": 1.0806709265175718, + "grad_norm": 0.31380224227905273, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1353 + }, + { + "epoch": 1.0814696485623003, + "grad_norm": 0.2782067060470581, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1354 + }, + { + "epoch": 1.0822683706070289, + "grad_norm": 0.04267412796616554, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1355 + }, + { + "epoch": 1.0830670926517572, + "grad_norm": 0.2687273919582367, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1356 + }, + { + "epoch": 1.0838658146964857, + "grad_norm": 0.3133341073989868, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1357 + }, + { + "epoch": 1.084664536741214, + "grad_norm": 0.11658725887537003, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1358 + }, + { + "epoch": 1.0854632587859425, + "grad_norm": 0.1339937299489975, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1359 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 0.15727631747722626, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1360 + }, + { + "epoch": 1.0870607028753994, + "grad_norm": 0.11759792268276215, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1361 + }, + { + "epoch": 1.0878594249201279, + "grad_norm": 0.11522746086120605, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1362 + }, + { + "epoch": 1.0886581469648562, + "grad_norm": 0.16571135818958282, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1363 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 0.09467484056949615, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1364 + }, + { + "epoch": 1.090255591054313, + "grad_norm": 0.07887586951255798, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1365 + }, + { + "epoch": 1.0910543130990416, + "grad_norm": 0.11297929286956787, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1366 + }, + { + "epoch": 1.09185303514377, + "grad_norm": 0.06402980536222458, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1367 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 0.11947043240070343, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1368 + }, + { + "epoch": 1.093450479233227, + "grad_norm": 0.06244207173585892, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1369 + }, + { + "epoch": 1.0942492012779552, + "grad_norm": 0.08165531605482101, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1370 + }, + { + "epoch": 1.0950479233226837, + "grad_norm": 0.03842553123831749, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 1371 + }, + { + "epoch": 1.095846645367412, + "grad_norm": 0.12175651639699936, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1372 + }, + { + "epoch": 1.0966453674121406, + "grad_norm": 0.1720212697982788, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1373 + }, + { + "epoch": 1.097444089456869, + "grad_norm": 0.15540143847465515, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1374 + }, + { + "epoch": 1.0982428115015974, + "grad_norm": 0.1056036502122879, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1375 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 0.06738443672657013, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1376 + }, + { + "epoch": 1.0998402555910542, + "grad_norm": 0.09600193798542023, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1377 + }, + { + "epoch": 1.1006389776357828, + "grad_norm": 0.11872005462646484, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1378 + }, + { + "epoch": 1.101437699680511, + "grad_norm": 0.04837389290332794, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1379 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 0.11245802789926529, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1380 + }, + { + "epoch": 1.1030351437699681, + "grad_norm": 0.1525758057832718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1381 + }, + { + "epoch": 1.1038338658146964, + "grad_norm": 0.07688060402870178, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1382 + }, + { + "epoch": 1.104632587859425, + "grad_norm": 0.05793362855911255, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1383 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 0.09737680107355118, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1384 + }, + { + "epoch": 1.1062300319488818, + "grad_norm": 0.15511851012706757, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1385 + }, + { + "epoch": 1.1070287539936103, + "grad_norm": 0.14931945502758026, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1386 + }, + { + "epoch": 1.1078274760383386, + "grad_norm": 0.1451406478881836, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1387 + }, + { + "epoch": 1.1086261980830672, + "grad_norm": 0.06013273820281029, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1388 + }, + { + "epoch": 1.1094249201277955, + "grad_norm": 0.08433987945318222, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1389 + }, + { + "epoch": 1.110223642172524, + "grad_norm": 0.12601709365844727, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1390 + }, + { + "epoch": 1.1110223642172523, + "grad_norm": 0.14611507952213287, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1391 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 0.10526898503303528, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1392 + }, + { + "epoch": 1.1126198083067094, + "grad_norm": 0.03592250496149063, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1393 + }, + { + "epoch": 1.1134185303514377, + "grad_norm": 0.07883994281291962, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1394 + }, + { + "epoch": 1.1142172523961662, + "grad_norm": 0.1351863145828247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1395 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 0.10423804074525833, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1396 + }, + { + "epoch": 1.115814696485623, + "grad_norm": 0.05230586603283882, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1397 + }, + { + "epoch": 1.1166134185303513, + "grad_norm": 0.03962033987045288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1398 + }, + { + "epoch": 1.1174121405750799, + "grad_norm": 0.08950864523649216, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1399 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.1326761394739151, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1400 + }, + { + "epoch": 1.1190095846645367, + "grad_norm": 0.1251986175775528, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1401 + }, + { + "epoch": 1.1198083067092652, + "grad_norm": 0.05831597000360489, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1402 + }, + { + "epoch": 1.1206070287539935, + "grad_norm": 0.11382800340652466, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1403 + }, + { + "epoch": 1.121405750798722, + "grad_norm": 0.16290108859539032, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1404 + }, + { + "epoch": 1.1222044728434506, + "grad_norm": 0.1721554696559906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1405 + }, + { + "epoch": 1.123003194888179, + "grad_norm": 0.09426763653755188, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1406 + }, + { + "epoch": 1.1238019169329074, + "grad_norm": 0.037366580218076706, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1407 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 0.07456237077713013, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1408 + }, + { + "epoch": 1.1253993610223643, + "grad_norm": 0.11701856553554535, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1409 + }, + { + "epoch": 1.1261980830670926, + "grad_norm": 0.13261918723583221, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1410 + }, + { + "epoch": 1.126996805111821, + "grad_norm": 0.09014345705509186, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1411 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 0.05398619920015335, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1412 + }, + { + "epoch": 1.128594249201278, + "grad_norm": 0.09375960379838943, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1413 + }, + { + "epoch": 1.1293929712460065, + "grad_norm": 0.09307628124952316, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1414 + }, + { + "epoch": 1.1301916932907348, + "grad_norm": 0.09488195180892944, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1415 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 0.08067089319229126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1416 + }, + { + "epoch": 1.1317891373801916, + "grad_norm": 0.043899055570364, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1417 + }, + { + "epoch": 1.1325878594249201, + "grad_norm": 0.05593986064195633, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1418 + }, + { + "epoch": 1.1333865814696487, + "grad_norm": 0.05736452341079712, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1419 + }, + { + "epoch": 1.134185303514377, + "grad_norm": 0.1092999204993248, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1420 + }, + { + "epoch": 1.1349840255591055, + "grad_norm": 0.18366938829421997, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1421 + }, + { + "epoch": 1.1357827476038338, + "grad_norm": 0.177176833152771, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1422 + }, + { + "epoch": 1.1365814696485623, + "grad_norm": 0.08829191327095032, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1423 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 0.07169382274150848, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1424 + }, + { + "epoch": 1.1381789137380192, + "grad_norm": 0.130388081073761, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1425 + }, + { + "epoch": 1.1389776357827477, + "grad_norm": 0.20726168155670166, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1426 + }, + { + "epoch": 1.139776357827476, + "grad_norm": 0.21683751046657562, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1427 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 0.131125345826149, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1428 + }, + { + "epoch": 1.1413738019169328, + "grad_norm": 0.04309925064444542, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1429 + }, + { + "epoch": 1.1421725239616614, + "grad_norm": 0.14427928626537323, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1430 + }, + { + "epoch": 1.1429712460063897, + "grad_norm": 0.1743481606245041, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1431 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 0.1037210002541542, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1432 + }, + { + "epoch": 1.1445686900958467, + "grad_norm": 0.11162228137254715, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1433 + }, + { + "epoch": 1.145367412140575, + "grad_norm": 0.25445371866226196, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1434 + }, + { + "epoch": 1.1461661341853036, + "grad_norm": 0.2771884799003601, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1435 + }, + { + "epoch": 1.1469648562300319, + "grad_norm": 0.10653509199619293, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1436 + }, + { + "epoch": 1.1477635782747604, + "grad_norm": 0.1745259016752243, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1437 + }, + { + "epoch": 1.148562300319489, + "grad_norm": 0.3151826560497284, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1438 + }, + { + "epoch": 1.1493610223642172, + "grad_norm": 0.23229722678661346, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1439 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 0.06131701543927193, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1440 + }, + { + "epoch": 1.150958466453674, + "grad_norm": 0.28753313422203064, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1441 + }, + { + "epoch": 1.1517571884984026, + "grad_norm": 0.3178791105747223, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1442 + }, + { + "epoch": 1.1525559105431311, + "grad_norm": 0.10008880496025085, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1443 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 0.2418096512556076, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1444 + }, + { + "epoch": 1.154153354632588, + "grad_norm": 0.34728583693504333, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1445 + }, + { + "epoch": 1.1549520766773163, + "grad_norm": 0.2172212153673172, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1446 + }, + { + "epoch": 1.1557507987220448, + "grad_norm": 0.04184277728199959, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1447 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 0.19960719347000122, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1448 + }, + { + "epoch": 1.1573482428115016, + "grad_norm": 0.19261692464351654, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1449 + }, + { + "epoch": 1.15814696485623, + "grad_norm": 0.08326124399900436, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1450 + }, + { + "epoch": 1.1589456869009584, + "grad_norm": 0.08552456647157669, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1451 + }, + { + "epoch": 1.159744408945687, + "grad_norm": 0.07903868705034256, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 1452 + }, + { + "epoch": 1.1605431309904153, + "grad_norm": 0.045095205307006836, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1453 + }, + { + "epoch": 1.1613418530351438, + "grad_norm": 0.08293266594409943, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1454 + }, + { + "epoch": 1.1621405750798721, + "grad_norm": 0.09431439638137817, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1455 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 0.04189104586839676, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1456 + }, + { + "epoch": 1.1637380191693292, + "grad_norm": 0.11492408066987991, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1457 + }, + { + "epoch": 1.1645367412140575, + "grad_norm": 0.16648449003696442, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1458 + }, + { + "epoch": 1.165335463258786, + "grad_norm": 0.1532576084136963, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1459 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 0.07438737154006958, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1460 + }, + { + "epoch": 1.1669329073482428, + "grad_norm": 0.0887872502207756, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 1461 + }, + { + "epoch": 1.1677316293929714, + "grad_norm": 0.17035096883773804, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1462 + }, + { + "epoch": 1.1685303514376997, + "grad_norm": 0.12702526152133942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1463 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 0.04788994789123535, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1464 + }, + { + "epoch": 1.1701277955271565, + "grad_norm": 0.15093912184238434, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1465 + }, + { + "epoch": 1.170926517571885, + "grad_norm": 0.1428089439868927, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1466 + }, + { + "epoch": 1.1717252396166133, + "grad_norm": 0.039421554654836655, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1467 + }, + { + "epoch": 1.1725239616613419, + "grad_norm": 0.09461840242147446, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1468 + }, + { + "epoch": 1.1733226837060702, + "grad_norm": 0.07272787392139435, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1469 + }, + { + "epoch": 1.1741214057507987, + "grad_norm": 0.10863790661096573, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1470 + }, + { + "epoch": 1.1749201277955272, + "grad_norm": 0.211805522441864, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1471 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 0.2124311476945877, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1472 + }, + { + "epoch": 1.176517571884984, + "grad_norm": 0.14013712108135223, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1473 + }, + { + "epoch": 1.1773162939297124, + "grad_norm": 0.10768178105354309, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1474 + }, + { + "epoch": 1.178115015974441, + "grad_norm": 0.07961699366569519, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1475 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 0.0772516280412674, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1476 + }, + { + "epoch": 1.1797124600638977, + "grad_norm": 0.11957084387540817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1477 + }, + { + "epoch": 1.1805111821086263, + "grad_norm": 0.1976107954978943, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1478 + }, + { + "epoch": 1.1813099041533546, + "grad_norm": 0.20915871858596802, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1479 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 0.10857495665550232, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1480 + }, + { + "epoch": 1.1829073482428114, + "grad_norm": 0.09961260855197906, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1481 + }, + { + "epoch": 1.18370607028754, + "grad_norm": 0.11908663064241409, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1482 + }, + { + "epoch": 1.1845047923322685, + "grad_norm": 0.0982719212770462, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1483 + }, + { + "epoch": 1.1853035143769968, + "grad_norm": 0.05869903787970543, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1484 + }, + { + "epoch": 1.1861022364217253, + "grad_norm": 0.14943145215511322, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1485 + }, + { + "epoch": 1.1869009584664536, + "grad_norm": 0.1761479526758194, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1486 + }, + { + "epoch": 1.1876996805111821, + "grad_norm": 0.1393168866634369, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1487 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 0.0473988801240921, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1488 + }, + { + "epoch": 1.189297124600639, + "grad_norm": 0.20789027214050293, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1489 + }, + { + "epoch": 1.1900958466453675, + "grad_norm": 0.29456260800361633, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1490 + }, + { + "epoch": 1.1908945686900958, + "grad_norm": 0.1875244528055191, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1491 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 0.052052468061447144, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1492 + }, + { + "epoch": 1.1924920127795526, + "grad_norm": 0.1376652717590332, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1493 + }, + { + "epoch": 1.1932907348242812, + "grad_norm": 0.1656588762998581, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1494 + }, + { + "epoch": 1.1940894568690097, + "grad_norm": 0.07063707709312439, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1495 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 0.12681347131729126, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1496 + }, + { + "epoch": 1.1956869009584665, + "grad_norm": 0.17560099065303802, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1497 + }, + { + "epoch": 1.1964856230031948, + "grad_norm": 0.10635025054216385, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1498 + }, + { + "epoch": 1.1972843450479234, + "grad_norm": 0.061567965894937515, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1499 + }, + { + "epoch": 1.1980830670926517, + "grad_norm": 0.12346719950437546, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1500 + }, + { + "epoch": 1.1988817891373802, + "grad_norm": 0.07105513662099838, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1501 + }, + { + "epoch": 1.1996805111821087, + "grad_norm": 0.07719466835260391, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1502 + }, + { + "epoch": 1.200479233226837, + "grad_norm": 0.1478763371706009, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1503 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 0.1383642554283142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1504 + }, + { + "epoch": 1.2020766773162939, + "grad_norm": 0.05519767478108406, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1505 + }, + { + "epoch": 1.2028753993610224, + "grad_norm": 0.06807537376880646, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1506 + }, + { + "epoch": 1.2036741214057507, + "grad_norm": 0.10652226209640503, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1507 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 0.044540517032146454, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1508 + }, + { + "epoch": 1.2052715654952078, + "grad_norm": 0.12266546487808228, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1509 + }, + { + "epoch": 1.206070287539936, + "grad_norm": 0.1997641921043396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1510 + }, + { + "epoch": 1.2068690095846646, + "grad_norm": 0.1924593299627304, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1511 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 0.09990391880273819, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1512 + }, + { + "epoch": 1.2084664536741214, + "grad_norm": 0.04226391762495041, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1513 + }, + { + "epoch": 1.20926517571885, + "grad_norm": 0.07116132974624634, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1514 + }, + { + "epoch": 1.2100638977635783, + "grad_norm": 0.046046894043684006, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1515 + }, + { + "epoch": 1.2108626198083068, + "grad_norm": 0.039608217775821686, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1516 + }, + { + "epoch": 1.211661341853035, + "grad_norm": 0.055937573313713074, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1517 + }, + { + "epoch": 1.2124600638977636, + "grad_norm": 0.09269243478775024, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1518 + }, + { + "epoch": 1.213258785942492, + "grad_norm": 0.04349381849169731, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1519 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 0.08543939888477325, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1520 + }, + { + "epoch": 1.2148562300319488, + "grad_norm": 0.1829536110162735, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1521 + }, + { + "epoch": 1.2156549520766773, + "grad_norm": 0.23422624170780182, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1522 + }, + { + "epoch": 1.2164536741214058, + "grad_norm": 0.13391408324241638, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1523 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 0.07262124121189117, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1524 + }, + { + "epoch": 1.2180511182108626, + "grad_norm": 0.1842898577451706, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1525 + }, + { + "epoch": 1.218849840255591, + "grad_norm": 0.16982080042362213, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1526 + }, + { + "epoch": 1.2196485623003195, + "grad_norm": 0.07628878951072693, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1527 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 0.07903175801038742, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1528 + }, + { + "epoch": 1.2212460063897763, + "grad_norm": 0.1874074637889862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1529 + }, + { + "epoch": 1.2220447284345048, + "grad_norm": 0.2084639072418213, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1530 + }, + { + "epoch": 1.2228434504792332, + "grad_norm": 0.161276176571846, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1531 + }, + { + "epoch": 1.2236421725239617, + "grad_norm": 0.07408371567726135, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1532 + }, + { + "epoch": 1.2244408945686902, + "grad_norm": 0.06918113678693771, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1533 + }, + { + "epoch": 1.2252396166134185, + "grad_norm": 0.15813148021697998, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1534 + }, + { + "epoch": 1.226038338658147, + "grad_norm": 0.1454530507326126, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 1535 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 0.07441768050193787, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1536 + }, + { + "epoch": 1.2276357827476039, + "grad_norm": 0.19151917099952698, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1537 + }, + { + "epoch": 1.2284345047923322, + "grad_norm": 0.22358526289463043, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1538 + }, + { + "epoch": 1.2292332268370607, + "grad_norm": 0.12382426857948303, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1539 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 0.09593929350376129, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1540 + }, + { + "epoch": 1.2308306709265175, + "grad_norm": 0.32887372374534607, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1541 + }, + { + "epoch": 1.231629392971246, + "grad_norm": 0.3910810351371765, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1542 + }, + { + "epoch": 1.2324281150159744, + "grad_norm": 0.21341568231582642, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1543 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 0.10242578387260437, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1544 + }, + { + "epoch": 1.2340255591054312, + "grad_norm": 0.2556541860103607, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 1.2348242811501597, + "grad_norm": 0.22671715915203094, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1546 + }, + { + "epoch": 1.2356230031948883, + "grad_norm": 0.05781029909849167, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1547 + }, + { + "epoch": 1.2364217252396166, + "grad_norm": 0.2803215980529785, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1548 + }, + { + "epoch": 1.237220447284345, + "grad_norm": 0.3391420543193817, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1549 + }, + { + "epoch": 1.2380191693290734, + "grad_norm": 0.17648665606975555, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1550 + }, + { + "epoch": 1.238817891373802, + "grad_norm": 0.14975208044052124, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1551 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 0.2930659353733063, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1552 + }, + { + "epoch": 1.2404153354632588, + "grad_norm": 0.16080376505851746, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1553 + }, + { + "epoch": 1.2412140575079873, + "grad_norm": 0.1765553057193756, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1554 + }, + { + "epoch": 1.2420127795527156, + "grad_norm": 0.43610313534736633, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1555 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 0.3448547124862671, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1556 + }, + { + "epoch": 1.2436102236421724, + "grad_norm": 0.11257574707269669, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1557 + }, + { + "epoch": 1.244408945686901, + "grad_norm": 0.2212686389684677, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1558 + }, + { + "epoch": 1.2452076677316293, + "grad_norm": 0.24576987326145172, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 1559 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 0.07592078298330307, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1560 + }, + { + "epoch": 1.2468051118210863, + "grad_norm": 0.18566438555717468, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1561 + }, + { + "epoch": 1.2476038338658146, + "grad_norm": 0.2345304936170578, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1562 + }, + { + "epoch": 1.2484025559105432, + "grad_norm": 0.12168031930923462, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1563 + }, + { + "epoch": 1.2492012779552715, + "grad_norm": 0.10168169438838959, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1564 + }, + { + "epoch": 1.25, + "grad_norm": 0.14832071959972382, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1565 + }, + { + "epoch": 1.2507987220447285, + "grad_norm": 0.04516097158193588, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1566 + }, + { + "epoch": 1.2515974440894568, + "grad_norm": 0.14377422630786896, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1567 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 0.12483170628547668, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1568 + }, + { + "epoch": 1.2531948881789137, + "grad_norm": 0.06861971318721771, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1569 + }, + { + "epoch": 1.2539936102236422, + "grad_norm": 0.1124153807759285, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1570 + }, + { + "epoch": 1.2547923322683707, + "grad_norm": 0.16883404552936554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1571 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 0.09533397108316422, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1572 + }, + { + "epoch": 1.2563897763578276, + "grad_norm": 0.09215923398733139, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1573 + }, + { + "epoch": 1.2571884984025559, + "grad_norm": 0.12701599299907684, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1574 + }, + { + "epoch": 1.2579872204472844, + "grad_norm": 0.09106232225894928, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1575 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 0.047954440116882324, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1576 + }, + { + "epoch": 1.2595846645367412, + "grad_norm": 0.13917528092861176, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1577 + }, + { + "epoch": 1.2603833865814695, + "grad_norm": 0.17694029211997986, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1578 + }, + { + "epoch": 1.261182108626198, + "grad_norm": 0.11021065711975098, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1579 + }, + { + "epoch": 1.2619808306709266, + "grad_norm": 0.03982831537723541, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1580 + }, + { + "epoch": 1.262779552715655, + "grad_norm": 0.08759493380784988, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1581 + }, + { + "epoch": 1.2635782747603834, + "grad_norm": 0.04797520861029625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1582 + }, + { + "epoch": 1.2643769968051117, + "grad_norm": 0.049942485988140106, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 1583 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 0.04236803576350212, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1584 + }, + { + "epoch": 1.2659744408945688, + "grad_norm": 0.05938104912638664, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1585 + }, + { + "epoch": 1.266773162939297, + "grad_norm": 0.07487885653972626, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1586 + }, + { + "epoch": 1.2675718849840256, + "grad_norm": 0.063072569668293, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1587 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 0.07140504568815231, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1588 + }, + { + "epoch": 1.2691693290734825, + "grad_norm": 0.04790132865309715, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1589 + }, + { + "epoch": 1.269968051118211, + "grad_norm": 0.050013668835163116, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1590 + }, + { + "epoch": 1.2707667731629393, + "grad_norm": 0.0559731163084507, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1591 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 0.04633013904094696, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1592 + }, + { + "epoch": 1.2723642172523961, + "grad_norm": 0.05252271518111229, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1593 + }, + { + "epoch": 1.2731629392971247, + "grad_norm": 0.0902840718626976, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1594 + }, + { + "epoch": 1.273961661341853, + "grad_norm": 0.07961871474981308, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1595 + }, + { + "epoch": 1.2747603833865815, + "grad_norm": 0.07653608173131943, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1596 + }, + { + "epoch": 1.2755591054313098, + "grad_norm": 0.15634121000766754, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1597 + }, + { + "epoch": 1.2763578274760383, + "grad_norm": 0.2045222818851471, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1598 + }, + { + "epoch": 1.2771565495207668, + "grad_norm": 0.1769608110189438, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1599 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.09675133973360062, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1600 + }, + { + "epoch": 1.2787539936102237, + "grad_norm": 0.055832285434007645, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1601 + }, + { + "epoch": 1.279552715654952, + "grad_norm": 0.09108291566371918, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1602 + }, + { + "epoch": 1.2803514376996805, + "grad_norm": 0.10872901976108551, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1603 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 0.08771848678588867, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1604 + }, + { + "epoch": 1.2819488817891374, + "grad_norm": 0.0731026753783226, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1605 + }, + { + "epoch": 1.2827476038338659, + "grad_norm": 0.040664345026016235, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1606 + }, + { + "epoch": 1.2835463258785942, + "grad_norm": 0.06111081317067146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1607 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 0.08753795176744461, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1608 + }, + { + "epoch": 1.2851437699680512, + "grad_norm": 0.07113729417324066, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1609 + }, + { + "epoch": 1.2859424920127795, + "grad_norm": 0.05469372868537903, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1610 + }, + { + "epoch": 1.2867412140575079, + "grad_norm": 0.05748649686574936, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1611 + }, + { + "epoch": 1.2875399361022364, + "grad_norm": 0.05832446366548538, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1612 + }, + { + "epoch": 1.288338658146965, + "grad_norm": 0.06085522472858429, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1613 + }, + { + "epoch": 1.2891373801916932, + "grad_norm": 0.08154775947332382, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1614 + }, + { + "epoch": 1.2899361022364217, + "grad_norm": 0.11568816751241684, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1615 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 0.06356564909219742, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1616 + }, + { + "epoch": 1.2915335463258786, + "grad_norm": 0.08187399804592133, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1617 + }, + { + "epoch": 1.292332268370607, + "grad_norm": 0.05326744168996811, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1618 + }, + { + "epoch": 1.2931309904153354, + "grad_norm": 0.05407040938735008, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1619 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 0.07292867451906204, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 1620 + }, + { + "epoch": 1.2947284345047922, + "grad_norm": 0.09447437524795532, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1621 + }, + { + "epoch": 1.2955271565495208, + "grad_norm": 0.0592079721391201, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1622 + }, + { + "epoch": 1.2963258785942493, + "grad_norm": 0.052008479833602905, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1623 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 0.06381972879171371, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1624 + }, + { + "epoch": 1.2979233226837061, + "grad_norm": 0.07434900850057602, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1625 + }, + { + "epoch": 1.2987220447284344, + "grad_norm": 0.06477486342191696, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1626 + }, + { + "epoch": 1.299520766773163, + "grad_norm": 0.13730554282665253, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1627 + }, + { + "epoch": 1.3003194888178915, + "grad_norm": 0.1683935821056366, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1628 + }, + { + "epoch": 1.3011182108626198, + "grad_norm": 0.08616848289966583, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1629 + }, + { + "epoch": 1.3019169329073481, + "grad_norm": 0.10220590978860855, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1630 + }, + { + "epoch": 1.3027156549520766, + "grad_norm": 0.22036917507648468, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1631 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.2277965545654297, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1632 + }, + { + "epoch": 1.3043130990415335, + "grad_norm": 0.10426606982946396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1633 + }, + { + "epoch": 1.305111821086262, + "grad_norm": 0.06641022861003876, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1634 + }, + { + "epoch": 1.3059105431309903, + "grad_norm": 0.09100072830915451, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1635 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 0.06551069766283035, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1636 + }, + { + "epoch": 1.3075079872204474, + "grad_norm": 0.04397547245025635, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1637 + }, + { + "epoch": 1.3083067092651757, + "grad_norm": 0.0781746581196785, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1638 + }, + { + "epoch": 1.3091054313099042, + "grad_norm": 0.07852843403816223, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1639 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 0.09224545955657959, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1640 + }, + { + "epoch": 1.310702875399361, + "grad_norm": 0.10179189592599869, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1641 + }, + { + "epoch": 1.3115015974440896, + "grad_norm": 0.07562009245157242, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1642 + }, + { + "epoch": 1.3123003194888179, + "grad_norm": 0.15463820099830627, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1643 + }, + { + "epoch": 1.3130990415335464, + "grad_norm": 0.05742334946990013, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 1644 + }, + { + "epoch": 1.3138977635782747, + "grad_norm": 0.09010195732116699, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1645 + }, + { + "epoch": 1.3146964856230032, + "grad_norm": 0.04284297674894333, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1646 + }, + { + "epoch": 1.3154952076677318, + "grad_norm": 0.07167239487171173, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1647 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 0.04978404566645622, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1648 + }, + { + "epoch": 1.3170926517571884, + "grad_norm": 0.2888668477535248, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1649 + }, + { + "epoch": 1.317891373801917, + "grad_norm": 0.13716880977153778, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1650 + }, + { + "epoch": 1.3186900958466454, + "grad_norm": 0.13081762194633484, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1651 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 0.046977054327726364, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1652 + }, + { + "epoch": 1.3202875399361023, + "grad_norm": 0.1331615000963211, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 1653 + }, + { + "epoch": 1.3210862619808306, + "grad_norm": 0.21066126227378845, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 1654 + }, + { + "epoch": 1.321884984025559, + "grad_norm": 0.23017194867134094, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1655 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 0.20224629342556, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1656 + }, + { + "epoch": 1.323482428115016, + "grad_norm": 0.09836700558662415, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1657 + }, + { + "epoch": 1.3242811501597445, + "grad_norm": 0.10621663928031921, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1658 + }, + { + "epoch": 1.3250798722044728, + "grad_norm": 0.25464868545532227, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1659 + }, + { + "epoch": 1.3258785942492013, + "grad_norm": 0.39965251088142395, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1660 + }, + { + "epoch": 1.3266773162939298, + "grad_norm": 0.4731796383857727, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1661 + }, + { + "epoch": 1.3274760383386581, + "grad_norm": 0.4287014603614807, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1662 + }, + { + "epoch": 1.3282747603833867, + "grad_norm": 0.15660974383354187, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1663 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.14340882003307343, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1664 + }, + { + "epoch": 1.3298722044728435, + "grad_norm": 0.23041795194149017, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1665 + }, + { + "epoch": 1.330670926517572, + "grad_norm": 0.14607569575309753, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1666 + }, + { + "epoch": 1.3314696485623003, + "grad_norm": 0.0620175264775753, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1667 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 0.1722227782011032, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1668 + }, + { + "epoch": 1.3330670926517572, + "grad_norm": 0.17676329612731934, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1669 + }, + { + "epoch": 1.3338658146964857, + "grad_norm": 0.10175948590040207, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1670 + }, + { + "epoch": 1.334664536741214, + "grad_norm": 0.052259646356105804, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1671 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 0.11740414053201675, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1672 + }, + { + "epoch": 1.3362619808306708, + "grad_norm": 0.13614653050899506, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1673 + }, + { + "epoch": 1.3370607028753994, + "grad_norm": 0.12058388441801071, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 1674 + }, + { + "epoch": 1.3378594249201279, + "grad_norm": 0.12473122030496597, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1675 + }, + { + "epoch": 1.3386581469648562, + "grad_norm": 0.11198705434799194, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1676 + }, + { + "epoch": 1.3394568690095847, + "grad_norm": 0.06745828688144684, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1677 + }, + { + "epoch": 1.340255591054313, + "grad_norm": 0.06042877584695816, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1678 + }, + { + "epoch": 1.3410543130990416, + "grad_norm": 0.08762289583683014, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1679 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 0.07612926512956619, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1680 + }, + { + "epoch": 1.3426517571884984, + "grad_norm": 0.16108228266239166, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1681 + }, + { + "epoch": 1.343450479233227, + "grad_norm": 0.12803438305854797, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1682 + }, + { + "epoch": 1.3442492012779552, + "grad_norm": 0.09190207719802856, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1683 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 0.07201807200908661, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1684 + }, + { + "epoch": 1.3458466453674123, + "grad_norm": 0.06885793805122375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1685 + }, + { + "epoch": 1.3466453674121406, + "grad_norm": 0.06998719274997711, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1686 + }, + { + "epoch": 1.3474440894568689, + "grad_norm": 0.08072122186422348, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1687 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 0.1314389705657959, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1688 + }, + { + "epoch": 1.349041533546326, + "grad_norm": 0.1393643617630005, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1689 + }, + { + "epoch": 1.3498402555910542, + "grad_norm": 0.1482846736907959, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1690 + }, + { + "epoch": 1.3506389776357828, + "grad_norm": 0.10097873955965042, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1691 + }, + { + "epoch": 1.351437699680511, + "grad_norm": 0.16020123660564423, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1692 + }, + { + "epoch": 1.3522364217252396, + "grad_norm": 0.4032374322414398, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1693 + }, + { + "epoch": 1.3530351437699681, + "grad_norm": 0.21653197705745697, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1694 + }, + { + "epoch": 1.3538338658146964, + "grad_norm": 0.18634478747844696, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1695 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 0.06293921917676926, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1696 + }, + { + "epoch": 1.3554313099041533, + "grad_norm": 0.09862471371889114, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1697 + }, + { + "epoch": 1.3562300319488818, + "grad_norm": 0.17562821507453918, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1698 + }, + { + "epoch": 1.3570287539936103, + "grad_norm": 0.17277459800243378, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1699 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 0.06883158534765244, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1700 + }, + { + "epoch": 1.3586261980830672, + "grad_norm": 0.06487718969583511, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1701 + }, + { + "epoch": 1.3594249201277955, + "grad_norm": 0.08988886326551437, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1702 + }, + { + "epoch": 1.360223642172524, + "grad_norm": 0.05164919048547745, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1703 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 0.143778458237648, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1704 + }, + { + "epoch": 1.3618210862619808, + "grad_norm": 0.21736390888690948, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1705 + }, + { + "epoch": 1.3626198083067091, + "grad_norm": 0.2496086061000824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1706 + }, + { + "epoch": 1.3634185303514377, + "grad_norm": 0.21299317479133606, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1707 + }, + { + "epoch": 1.3642172523961662, + "grad_norm": 0.06845723092556, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1708 + }, + { + "epoch": 1.3650159744408945, + "grad_norm": 0.14018614590168, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1709 + }, + { + "epoch": 1.365814696485623, + "grad_norm": 0.1971539407968521, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1710 + }, + { + "epoch": 1.3666134185303513, + "grad_norm": 0.10819724202156067, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 1711 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 0.12900666892528534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1712 + }, + { + "epoch": 1.3682108626198084, + "grad_norm": 0.17080886662006378, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1713 + }, + { + "epoch": 1.3690095846645367, + "grad_norm": 0.22689902782440186, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1714 + }, + { + "epoch": 1.3698083067092652, + "grad_norm": 0.2200036197900772, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1715 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 0.15193268656730652, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1716 + }, + { + "epoch": 1.371405750798722, + "grad_norm": 0.057297177612781525, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1717 + }, + { + "epoch": 1.3722044728434506, + "grad_norm": 0.12024576961994171, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1718 + }, + { + "epoch": 1.373003194888179, + "grad_norm": 0.16183575987815857, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1719 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 0.14740106463432312, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 1720 + }, + { + "epoch": 1.3746006389776357, + "grad_norm": 0.09009548276662827, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1721 + }, + { + "epoch": 1.3753993610223643, + "grad_norm": 0.05091484636068344, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1722 + }, + { + "epoch": 1.3761980830670926, + "grad_norm": 0.05887647345662117, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1723 + }, + { + "epoch": 1.376996805111821, + "grad_norm": 0.06313642859458923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1724 + }, + { + "epoch": 1.3777955271565494, + "grad_norm": 0.06496263295412064, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1725 + }, + { + "epoch": 1.378594249201278, + "grad_norm": 0.06047922000288963, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1726 + }, + { + "epoch": 1.3793929712460065, + "grad_norm": 0.05579136312007904, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 1727 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 0.05931869521737099, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1728 + }, + { + "epoch": 1.3809904153354633, + "grad_norm": 0.049043234437704086, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1729 + }, + { + "epoch": 1.3817891373801916, + "grad_norm": 0.051883842796087265, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1730 + }, + { + "epoch": 1.3825878594249201, + "grad_norm": 0.07195441424846649, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1731 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 0.12339463829994202, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1732 + }, + { + "epoch": 1.384185303514377, + "grad_norm": 0.16951170563697815, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1733 + }, + { + "epoch": 1.3849840255591055, + "grad_norm": 0.1773078590631485, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1734 + }, + { + "epoch": 1.3857827476038338, + "grad_norm": 0.15160880982875824, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1735 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 0.12933489680290222, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1736 + }, + { + "epoch": 1.3873801916932909, + "grad_norm": 0.05910791456699371, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1737 + }, + { + "epoch": 1.3881789137380192, + "grad_norm": 0.06765501946210861, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1738 + }, + { + "epoch": 1.3889776357827475, + "grad_norm": 0.09179043024778366, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1739 + }, + { + "epoch": 1.389776357827476, + "grad_norm": 0.08842387795448303, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 1740 + }, + { + "epoch": 1.3905750798722045, + "grad_norm": 0.07700884342193604, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1741 + }, + { + "epoch": 1.3913738019169328, + "grad_norm": 0.045392196625471115, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1742 + }, + { + "epoch": 1.3921725239616614, + "grad_norm": 0.11977320909500122, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1743 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 0.1882479041814804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1744 + }, + { + "epoch": 1.3937699680511182, + "grad_norm": 0.25021475553512573, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1745 + }, + { + "epoch": 1.3945686900958467, + "grad_norm": 0.23374556005001068, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1746 + }, + { + "epoch": 1.395367412140575, + "grad_norm": 0.1016339659690857, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1747 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 0.1340985745191574, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1748 + }, + { + "epoch": 1.3969648562300319, + "grad_norm": 0.21048963069915771, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1749 + }, + { + "epoch": 1.3977635782747604, + "grad_norm": 0.20711666345596313, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1750 + }, + { + "epoch": 1.398562300319489, + "grad_norm": 0.19101384282112122, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1751 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 0.17655788362026215, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1752 + }, + { + "epoch": 1.4001597444089458, + "grad_norm": 0.11994078010320663, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1753 + }, + { + "epoch": 1.400958466453674, + "grad_norm": 0.09805315732955933, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1754 + }, + { + "epoch": 1.4017571884984026, + "grad_norm": 0.07474519312381744, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1755 + }, + { + "epoch": 1.4025559105431311, + "grad_norm": 0.11269772797822952, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1756 + }, + { + "epoch": 1.4033546325878594, + "grad_norm": 0.08900775015354156, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1757 + }, + { + "epoch": 1.4041533546325877, + "grad_norm": 0.05614674836397171, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1758 + }, + { + "epoch": 1.4049520766773163, + "grad_norm": 0.12895621359348297, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1759 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 0.16433797776699066, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1760 + }, + { + "epoch": 1.406549520766773, + "grad_norm": 0.20009422302246094, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1761 + }, + { + "epoch": 1.4073482428115016, + "grad_norm": 0.146495059132576, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1762 + }, + { + "epoch": 1.40814696485623, + "grad_norm": 0.07518120110034943, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1763 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 0.09864111244678497, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1764 + }, + { + "epoch": 1.409744408945687, + "grad_norm": 0.20213425159454346, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 1765 + }, + { + "epoch": 1.4105431309904153, + "grad_norm": 0.17369656264781952, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1766 + }, + { + "epoch": 1.4113418530351438, + "grad_norm": 0.06627536565065384, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1767 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 0.09098218381404877, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1768 + }, + { + "epoch": 1.4129392971246006, + "grad_norm": 0.11730248481035233, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1769 + }, + { + "epoch": 1.4137380191693292, + "grad_norm": 0.07061973959207535, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1770 + }, + { + "epoch": 1.4145367412140575, + "grad_norm": 0.10279946774244308, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1771 + }, + { + "epoch": 1.415335463258786, + "grad_norm": 0.18082919716835022, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1772 + }, + { + "epoch": 1.4161341853035143, + "grad_norm": 0.1592867076396942, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1773 + }, + { + "epoch": 1.4169329073482428, + "grad_norm": 0.09976492077112198, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1774 + }, + { + "epoch": 1.4177316293929714, + "grad_norm": 0.060737378895282745, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1775 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 0.06248186528682709, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1776 + }, + { + "epoch": 1.419329073482428, + "grad_norm": 0.13300968706607819, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1777 + }, + { + "epoch": 1.4201277955271565, + "grad_norm": 0.1979697346687317, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1778 + }, + { + "epoch": 1.420926517571885, + "grad_norm": 0.23268306255340576, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1779 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 0.18313626945018768, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1780 + }, + { + "epoch": 1.4225239616613419, + "grad_norm": 0.08110051602125168, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1781 + }, + { + "epoch": 1.4233226837060702, + "grad_norm": 0.09732743352651596, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1782 + }, + { + "epoch": 1.4241214057507987, + "grad_norm": 0.1656067669391632, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1783 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 0.1959427297115326, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1784 + }, + { + "epoch": 1.4257188498402555, + "grad_norm": 0.17609809339046478, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1785 + }, + { + "epoch": 1.426517571884984, + "grad_norm": 0.0999840646982193, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1786 + }, + { + "epoch": 1.4273162939297124, + "grad_norm": 0.06475909799337387, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1787 + }, + { + "epoch": 1.428115015974441, + "grad_norm": 0.1364496946334839, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1788 + }, + { + "epoch": 1.4289137380191694, + "grad_norm": 0.21113638579845428, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 1789 + }, + { + "epoch": 1.4297124600638977, + "grad_norm": 0.25998085737228394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1790 + }, + { + "epoch": 1.4305111821086263, + "grad_norm": 0.24930700659751892, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1791 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 0.131307452917099, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1792 + }, + { + "epoch": 1.432108626198083, + "grad_norm": 0.0739457756280899, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1793 + }, + { + "epoch": 1.4329073482428116, + "grad_norm": 0.2009744644165039, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1794 + }, + { + "epoch": 1.43370607028754, + "grad_norm": 0.28875023126602173, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1795 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 0.25421038269996643, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1796 + }, + { + "epoch": 1.4353035143769968, + "grad_norm": 0.09670932590961456, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1797 + }, + { + "epoch": 1.4361022364217253, + "grad_norm": 0.11264955252408981, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1798 + }, + { + "epoch": 1.4369009584664536, + "grad_norm": 0.1401909440755844, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1799 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 0.08234099298715591, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1800 + }, + { + "epoch": 1.4384984025559104, + "grad_norm": 0.05028436705470085, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1801 + }, + { + "epoch": 1.439297124600639, + "grad_norm": 0.04673704132437706, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1802 + }, + { + "epoch": 1.4400958466453675, + "grad_norm": 0.07369101047515869, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 1803 + }, + { + "epoch": 1.4408945686900958, + "grad_norm": 0.161424919962883, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1804 + }, + { + "epoch": 1.4416932907348243, + "grad_norm": 0.13576306402683258, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1805 + }, + { + "epoch": 1.4424920127795526, + "grad_norm": 0.063505619764328, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 1806 + }, + { + "epoch": 1.4432907348242812, + "grad_norm": 0.07231617718935013, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1807 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 0.1698617935180664, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1808 + }, + { + "epoch": 1.444888178913738, + "grad_norm": 0.16520395874977112, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1809 + }, + { + "epoch": 1.4456869009584665, + "grad_norm": 0.058485522866249084, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1810 + }, + { + "epoch": 1.4464856230031948, + "grad_norm": 0.0816773921251297, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1811 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 0.15307661890983582, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1812 + }, + { + "epoch": 1.4480830670926519, + "grad_norm": 0.20710408687591553, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 1813 + }, + { + "epoch": 1.4488817891373802, + "grad_norm": 0.1786869764328003, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1814 + }, + { + "epoch": 1.4496805111821085, + "grad_norm": 0.07363469898700714, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1815 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 0.10158272087574005, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1816 + }, + { + "epoch": 1.4512779552715656, + "grad_norm": 0.14304493367671967, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1817 + }, + { + "epoch": 1.4520766773162939, + "grad_norm": 0.11782495677471161, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1818 + }, + { + "epoch": 1.4528753993610224, + "grad_norm": 0.09340433776378632, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1819 + }, + { + "epoch": 1.4536741214057507, + "grad_norm": 0.08881603926420212, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1820 + }, + { + "epoch": 1.4544728434504792, + "grad_norm": 0.1377323865890503, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1821 + }, + { + "epoch": 1.4552715654952078, + "grad_norm": 0.1137915700674057, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1822 + }, + { + "epoch": 1.456070287539936, + "grad_norm": 0.08219580352306366, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1823 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 0.048282165080308914, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 1824 + }, + { + "epoch": 1.457667731629393, + "grad_norm": 0.07061316817998886, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1825 + }, + { + "epoch": 1.4584664536741214, + "grad_norm": 0.09383007138967514, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1826 + }, + { + "epoch": 1.45926517571885, + "grad_norm": 0.10688310861587524, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1827 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 0.09751323610544205, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 1828 + }, + { + "epoch": 1.4608626198083068, + "grad_norm": 0.10437846183776855, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1829 + }, + { + "epoch": 1.461661341853035, + "grad_norm": 0.13903124630451202, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1830 + }, + { + "epoch": 1.4624600638977636, + "grad_norm": 0.09480495005846024, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1831 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 0.062304843217134476, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1832 + }, + { + "epoch": 1.4640575079872205, + "grad_norm": 0.13482356071472168, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1833 + }, + { + "epoch": 1.4648562300319488, + "grad_norm": 0.2302182912826538, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1834 + }, + { + "epoch": 1.4656549520766773, + "grad_norm": 0.28565964102745056, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1835 + }, + { + "epoch": 1.4664536741214058, + "grad_norm": 0.28437626361846924, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1836 + }, + { + "epoch": 1.4672523961661341, + "grad_norm": 0.20637334883213043, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1837 + }, + { + "epoch": 1.4680511182108626, + "grad_norm": 0.08829299360513687, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1838 + }, + { + "epoch": 1.468849840255591, + "grad_norm": 0.06338132172822952, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 1839 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 0.13094602525234222, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1840 + }, + { + "epoch": 1.470447284345048, + "grad_norm": 0.15911467373371124, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1841 + }, + { + "epoch": 1.4712460063897763, + "grad_norm": 0.10913829505443573, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 1842 + }, + { + "epoch": 1.4720447284345048, + "grad_norm": 0.06934744864702225, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1843 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.07930968701839447, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1844 + }, + { + "epoch": 1.4736421725239617, + "grad_norm": 0.11225491017103195, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1845 + }, + { + "epoch": 1.4744408945686902, + "grad_norm": 0.12815739214420319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1846 + }, + { + "epoch": 1.4752396166134185, + "grad_norm": 0.0943179577589035, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1847 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 0.051353566348552704, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1848 + }, + { + "epoch": 1.4768370607028753, + "grad_norm": 0.10284367203712463, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1849 + }, + { + "epoch": 1.4776357827476039, + "grad_norm": 0.18345551192760468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1850 + }, + { + "epoch": 1.4784345047923324, + "grad_norm": 0.19532762467861176, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1851 + }, + { + "epoch": 1.4792332268370607, + "grad_norm": 0.12518467009067535, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1852 + }, + { + "epoch": 1.480031948881789, + "grad_norm": 0.05363085865974426, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1853 + }, + { + "epoch": 1.4808306709265175, + "grad_norm": 0.18222568929195404, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 1854 + }, + { + "epoch": 1.481629392971246, + "grad_norm": 0.19992542266845703, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1855 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 0.1724570095539093, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1856 + }, + { + "epoch": 1.483226837060703, + "grad_norm": 0.04096012935042381, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1857 + }, + { + "epoch": 1.4840255591054312, + "grad_norm": 0.15409474074840546, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1858 + }, + { + "epoch": 1.4848242811501597, + "grad_norm": 0.29238876700401306, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1859 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 0.35619401931762695, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1860 + }, + { + "epoch": 1.4864217252396166, + "grad_norm": 0.2790282964706421, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1861 + }, + { + "epoch": 1.487220447284345, + "grad_norm": 0.0809629037976265, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1862 + }, + { + "epoch": 1.4880191693290734, + "grad_norm": 0.1827513724565506, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1863 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 0.2284395545721054, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1864 + }, + { + "epoch": 1.4896166134185305, + "grad_norm": 0.11697912216186523, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1865 + }, + { + "epoch": 1.4904153354632588, + "grad_norm": 0.08668534457683563, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 1866 + }, + { + "epoch": 1.4912140575079873, + "grad_norm": 0.19793611764907837, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1867 + }, + { + "epoch": 1.4920127795527156, + "grad_norm": 0.18775872886180878, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1868 + }, + { + "epoch": 1.4928115015974441, + "grad_norm": 0.07068412005901337, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1869 + }, + { + "epoch": 1.4936102236421724, + "grad_norm": 0.07640416920185089, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1870 + }, + { + "epoch": 1.494408945686901, + "grad_norm": 0.1333264708518982, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1871 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 0.13000380992889404, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1872 + }, + { + "epoch": 1.4960063897763578, + "grad_norm": 0.05382491648197174, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1873 + }, + { + "epoch": 1.4968051118210863, + "grad_norm": 0.12773285806179047, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1874 + }, + { + "epoch": 1.4976038338658146, + "grad_norm": 0.2441176027059555, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1875 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 0.26628851890563965, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1876 + }, + { + "epoch": 1.4992012779552715, + "grad_norm": 0.1295953392982483, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1877 + }, + { + "epoch": 1.5, + "grad_norm": 0.10860511660575867, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1878 + }, + { + "epoch": 1.5007987220447285, + "grad_norm": 0.25177180767059326, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1879 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 0.2379150688648224, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1880 + }, + { + "epoch": 1.5023961661341851, + "grad_norm": 0.101965993642807, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1881 + }, + { + "epoch": 1.5031948881789137, + "grad_norm": 0.15633052587509155, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1882 + }, + { + "epoch": 1.5039936102236422, + "grad_norm": 0.3071416914463043, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1883 + }, + { + "epoch": 1.5047923322683707, + "grad_norm": 0.2126736044883728, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1884 + }, + { + "epoch": 1.505591054313099, + "grad_norm": 0.05252298340201378, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1885 + }, + { + "epoch": 1.5063897763578273, + "grad_norm": 0.23854316771030426, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1886 + }, + { + "epoch": 1.5071884984025559, + "grad_norm": 0.305148720741272, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1887 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 0.1371227502822876, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1888 + }, + { + "epoch": 1.508785942492013, + "grad_norm": 0.16433516144752502, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1889 + }, + { + "epoch": 1.5095846645367412, + "grad_norm": 0.24010877311229706, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1890 + }, + { + "epoch": 1.5103833865814695, + "grad_norm": 0.12839943170547485, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1891 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 0.055945366621017456, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1892 + }, + { + "epoch": 1.5119808306709266, + "grad_norm": 0.16645023226737976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1893 + }, + { + "epoch": 1.5127795527156551, + "grad_norm": 0.14626996219158173, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1894 + }, + { + "epoch": 1.5135782747603834, + "grad_norm": 0.04274629056453705, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1895 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 0.10497253388166428, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1896 + }, + { + "epoch": 1.5151757188498403, + "grad_norm": 0.159364715218544, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 1897 + }, + { + "epoch": 1.5159744408945688, + "grad_norm": 0.11409968137741089, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1898 + }, + { + "epoch": 1.516773162939297, + "grad_norm": 0.03989424183964729, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1899 + }, + { + "epoch": 1.5175718849840254, + "grad_norm": 0.12703374028205872, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1900 + }, + { + "epoch": 1.518370607028754, + "grad_norm": 0.20534875988960266, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1901 + }, + { + "epoch": 1.5191693290734825, + "grad_norm": 0.2276938110589981, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1902 + }, + { + "epoch": 1.519968051118211, + "grad_norm": 0.114278644323349, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1903 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 0.08295118063688278, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 1904 + }, + { + "epoch": 1.5215654952076676, + "grad_norm": 0.18610796332359314, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1905 + }, + { + "epoch": 1.5223642172523961, + "grad_norm": 0.1920524388551712, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1906 + }, + { + "epoch": 1.5231629392971247, + "grad_norm": 0.06447675824165344, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1907 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 0.17821159958839417, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1908 + }, + { + "epoch": 1.5247603833865815, + "grad_norm": 0.23894363641738892, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1909 + }, + { + "epoch": 1.5255591054313098, + "grad_norm": 0.14711391925811768, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1910 + }, + { + "epoch": 1.5263578274760383, + "grad_norm": 0.07863837480545044, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1911 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 0.20990678668022156, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 1912 + }, + { + "epoch": 1.5279552715654952, + "grad_norm": 0.19979886710643768, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1913 + }, + { + "epoch": 1.5287539936102237, + "grad_norm": 0.0871618464589119, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1914 + }, + { + "epoch": 1.529552715654952, + "grad_norm": 0.09294576942920685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1915 + }, + { + "epoch": 1.5303514376996805, + "grad_norm": 0.23010258376598358, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1916 + }, + { + "epoch": 1.531150159744409, + "grad_norm": 0.2919708788394928, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 1917 + }, + { + "epoch": 1.5319488817891374, + "grad_norm": 0.21767428517341614, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1918 + }, + { + "epoch": 1.5327476038338657, + "grad_norm": 0.07844182848930359, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1919 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 0.14891114830970764, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1920 + }, + { + "epoch": 1.5343450479233227, + "grad_norm": 0.17959977686405182, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1921 + }, + { + "epoch": 1.5351437699680512, + "grad_norm": 0.10217028856277466, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1922 + }, + { + "epoch": 1.5359424920127795, + "grad_norm": 0.08135818690061569, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1923 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 0.19660547375679016, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1924 + }, + { + "epoch": 1.5375399361022364, + "grad_norm": 0.2106354534626007, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1925 + }, + { + "epoch": 1.538338658146965, + "grad_norm": 0.11042182147502899, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1926 + }, + { + "epoch": 1.5391373801916934, + "grad_norm": 0.08777181059122086, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1927 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 0.18283812701702118, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1928 + }, + { + "epoch": 1.54073482428115, + "grad_norm": 0.11731691658496857, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1929 + }, + { + "epoch": 1.5415335463258786, + "grad_norm": 0.04163304716348648, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1930 + }, + { + "epoch": 1.542332268370607, + "grad_norm": 0.12119868397712708, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1931 + }, + { + "epoch": 1.5431309904153354, + "grad_norm": 0.18475785851478577, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1932 + }, + { + "epoch": 1.543929712460064, + "grad_norm": 0.16582897305488586, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1933 + }, + { + "epoch": 1.5447284345047922, + "grad_norm": 0.086383156478405, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1934 + }, + { + "epoch": 1.5455271565495208, + "grad_norm": 0.047143738716840744, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1935 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 0.0830119326710701, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1936 + }, + { + "epoch": 1.5471246006389776, + "grad_norm": 0.14226214587688446, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1937 + }, + { + "epoch": 1.547923322683706, + "grad_norm": 0.1719929724931717, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1938 + }, + { + "epoch": 1.5487220447284344, + "grad_norm": 0.18388192355632782, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1939 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 0.16870245337486267, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1940 + }, + { + "epoch": 1.5503194888178915, + "grad_norm": 0.1100412905216217, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1941 + }, + { + "epoch": 1.5511182108626198, + "grad_norm": 0.05124165490269661, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1942 + }, + { + "epoch": 1.5519169329073481, + "grad_norm": 0.08937443792819977, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1943 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 0.13589949905872345, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1944 + }, + { + "epoch": 1.5535143769968052, + "grad_norm": 0.12346407026052475, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1945 + }, + { + "epoch": 1.5543130990415337, + "grad_norm": 0.11836438626050949, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1946 + }, + { + "epoch": 1.555111821086262, + "grad_norm": 0.07569031417369843, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1947 + }, + { + "epoch": 1.5559105431309903, + "grad_norm": 0.039178211241960526, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1948 + }, + { + "epoch": 1.5567092651757188, + "grad_norm": 0.0431843139231205, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1949 + }, + { + "epoch": 1.5575079872204474, + "grad_norm": 0.06331207603216171, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1950 + }, + { + "epoch": 1.5583067092651757, + "grad_norm": 0.0670275092124939, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1951 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 0.04372883588075638, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1952 + }, + { + "epoch": 1.5599041533546325, + "grad_norm": 0.15768256783485413, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1953 + }, + { + "epoch": 1.560702875399361, + "grad_norm": 0.30828192830085754, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1954 + }, + { + "epoch": 1.5615015974440896, + "grad_norm": 0.3741140365600586, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1955 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 0.25689223408699036, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1956 + }, + { + "epoch": 1.5630990415335462, + "grad_norm": 0.0691552683711052, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1957 + }, + { + "epoch": 1.5638977635782747, + "grad_norm": 0.2742094099521637, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1958 + }, + { + "epoch": 1.5646964856230032, + "grad_norm": 0.2760325074195862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1959 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 0.09094057232141495, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1960 + }, + { + "epoch": 1.56629392971246, + "grad_norm": 0.11926092952489853, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1961 + }, + { + "epoch": 1.5670926517571884, + "grad_norm": 0.18398839235305786, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1962 + }, + { + "epoch": 1.567891373801917, + "grad_norm": 0.17090962827205658, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1963 + }, + { + "epoch": 1.5686900958466454, + "grad_norm": 0.07806222885847092, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1964 + }, + { + "epoch": 1.569488817891374, + "grad_norm": 0.17260140180587769, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1965 + }, + { + "epoch": 1.5702875399361023, + "grad_norm": 0.2848401665687561, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1966 + }, + { + "epoch": 1.5710862619808306, + "grad_norm": 0.19075879454612732, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1967 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 0.044234778732061386, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 1968 + }, + { + "epoch": 1.5726837060702876, + "grad_norm": 0.16188788414001465, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1969 + }, + { + "epoch": 1.573482428115016, + "grad_norm": 0.19148766994476318, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1970 + }, + { + "epoch": 1.5742811501597445, + "grad_norm": 0.11576604843139648, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1971 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 0.049716517329216, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1972 + }, + { + "epoch": 1.5758785942492013, + "grad_norm": 0.12528614699840546, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1973 + }, + { + "epoch": 1.5766773162939298, + "grad_norm": 0.1574268341064453, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1974 + }, + { + "epoch": 1.5774760383386581, + "grad_norm": 0.06606525182723999, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1975 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 0.16142094135284424, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1976 + }, + { + "epoch": 1.579073482428115, + "grad_norm": 0.29769718647003174, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1977 + }, + { + "epoch": 1.5798722044728435, + "grad_norm": 0.20111548900604248, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1978 + }, + { + "epoch": 1.580670926517572, + "grad_norm": 0.06375493854284286, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1979 + }, + { + "epoch": 1.5814696485623003, + "grad_norm": 0.2208068072795868, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1980 + }, + { + "epoch": 1.5822683706070286, + "grad_norm": 0.2920839488506317, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1981 + }, + { + "epoch": 1.5830670926517572, + "grad_norm": 0.2115958034992218, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1982 + }, + { + "epoch": 1.5838658146964857, + "grad_norm": 0.048249468207359314, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1983 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 0.15551301836967468, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1984 + }, + { + "epoch": 1.5854632587859425, + "grad_norm": 0.2190883755683899, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 1985 + }, + { + "epoch": 1.5862619808306708, + "grad_norm": 0.15155111253261566, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1986 + }, + { + "epoch": 1.5870607028753994, + "grad_norm": 0.056616391986608505, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1987 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 0.1638905555009842, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1988 + }, + { + "epoch": 1.5886581469648562, + "grad_norm": 0.11643283069133759, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1989 + }, + { + "epoch": 1.5894568690095847, + "grad_norm": 0.06423045694828033, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1990 + }, + { + "epoch": 1.590255591054313, + "grad_norm": 0.11044095456600189, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1991 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 0.11911707371473312, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1992 + }, + { + "epoch": 1.59185303514377, + "grad_norm": 0.045604925602674484, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1993 + }, + { + "epoch": 1.5926517571884984, + "grad_norm": 0.10280558466911316, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1994 + }, + { + "epoch": 1.5934504792332267, + "grad_norm": 0.13807371258735657, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1995 + }, + { + "epoch": 1.5942492012779552, + "grad_norm": 0.06163270026445389, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1996 + }, + { + "epoch": 1.5950479233226837, + "grad_norm": 0.12899963557720184, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1997 + }, + { + "epoch": 1.5958466453674123, + "grad_norm": 0.24358411133289337, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 1998 + }, + { + "epoch": 1.5966453674121406, + "grad_norm": 0.23341934382915497, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1999 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 0.11766334623098373, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2000 + }, + { + "epoch": 1.5982428115015974, + "grad_norm": 0.07918071746826172, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2001 + }, + { + "epoch": 1.599041533546326, + "grad_norm": 0.1473437398672104, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2002 + }, + { + "epoch": 1.5998402555910545, + "grad_norm": 0.08945708721876144, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2003 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.06553255021572113, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2004 + }, + { + "epoch": 1.601437699680511, + "grad_norm": 0.12708786129951477, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2005 + }, + { + "epoch": 1.6022364217252396, + "grad_norm": 0.16935905814170837, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2006 + }, + { + "epoch": 1.6030351437699681, + "grad_norm": 0.10428016632795334, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2007 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 0.06016766279935837, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 2008 + }, + { + "epoch": 1.604632587859425, + "grad_norm": 0.1563751995563507, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2009 + }, + { + "epoch": 1.6054313099041533, + "grad_norm": 0.1919829398393631, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2010 + }, + { + "epoch": 1.6062300319488818, + "grad_norm": 0.14739179611206055, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2011 + }, + { + "epoch": 1.6070287539936103, + "grad_norm": 0.08086550235748291, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2012 + }, + { + "epoch": 1.6078274760383386, + "grad_norm": 0.06594815105199814, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2013 + }, + { + "epoch": 1.608626198083067, + "grad_norm": 0.10502789169549942, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2014 + }, + { + "epoch": 1.6094249201277955, + "grad_norm": 0.1312190145254135, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2015 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 0.062411367893218994, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2016 + }, + { + "epoch": 1.6110223642172525, + "grad_norm": 0.04986036196351051, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2017 + }, + { + "epoch": 1.6118210862619808, + "grad_norm": 0.08428573608398438, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2018 + }, + { + "epoch": 1.6126198083067091, + "grad_norm": 0.11552372574806213, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2019 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 0.07657046616077423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2020 + }, + { + "epoch": 1.6142172523961662, + "grad_norm": 0.05540962517261505, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 2021 + }, + { + "epoch": 1.6150159744408947, + "grad_norm": 0.048573557287454605, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2022 + }, + { + "epoch": 1.615814696485623, + "grad_norm": 0.08630840480327606, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2023 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 0.06090754270553589, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2024 + }, + { + "epoch": 1.6174121405750799, + "grad_norm": 0.05828041955828667, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2025 + }, + { + "epoch": 1.6182108626198084, + "grad_norm": 0.12483426928520203, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2026 + }, + { + "epoch": 1.6190095846645367, + "grad_norm": 0.13772840797901154, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2027 + }, + { + "epoch": 1.619808306709265, + "grad_norm": 0.08477568626403809, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2028 + }, + { + "epoch": 1.6206070287539935, + "grad_norm": 0.037577688694000244, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2029 + }, + { + "epoch": 1.621405750798722, + "grad_norm": 0.07961893081665039, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2030 + }, + { + "epoch": 1.6222044728434506, + "grad_norm": 0.06744182854890823, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2031 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 0.06228869408369064, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2032 + }, + { + "epoch": 1.6238019169329072, + "grad_norm": 0.1972920298576355, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2033 + }, + { + "epoch": 1.6246006389776357, + "grad_norm": 0.2701529562473297, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2034 + }, + { + "epoch": 1.6253993610223643, + "grad_norm": 0.20371970534324646, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2035 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 0.08887646347284317, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2036 + }, + { + "epoch": 1.626996805111821, + "grad_norm": 0.06480003893375397, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2037 + }, + { + "epoch": 1.6277955271565494, + "grad_norm": 0.089780792593956, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2038 + }, + { + "epoch": 1.628594249201278, + "grad_norm": 0.04014933854341507, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2039 + }, + { + "epoch": 1.6293929712460065, + "grad_norm": 0.0993470847606659, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 2040 + }, + { + "epoch": 1.630191693290735, + "grad_norm": 0.1957429200410843, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2041 + }, + { + "epoch": 1.6309904153354633, + "grad_norm": 0.2273249477148056, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2042 + }, + { + "epoch": 1.6317891373801916, + "grad_norm": 0.1936638057231903, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2043 + }, + { + "epoch": 1.6325878594249201, + "grad_norm": 0.10150687396526337, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2044 + }, + { + "epoch": 1.6333865814696487, + "grad_norm": 0.051224563270807266, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2045 + }, + { + "epoch": 1.634185303514377, + "grad_norm": 0.13044138252735138, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2046 + }, + { + "epoch": 1.6349840255591053, + "grad_norm": 0.16140064597129822, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2047 + }, + { + "epoch": 1.6357827476038338, + "grad_norm": 0.13187173008918762, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2048 + }, + { + "epoch": 1.6365814696485623, + "grad_norm": 0.03873397782444954, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2049 + }, + { + "epoch": 1.6373801916932909, + "grad_norm": 0.0575883649289608, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2050 + }, + { + "epoch": 1.6381789137380192, + "grad_norm": 0.039476748555898666, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 2051 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 0.06802869588136673, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2052 + }, + { + "epoch": 1.639776357827476, + "grad_norm": 0.059946198016405106, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2053 + }, + { + "epoch": 1.6405750798722045, + "grad_norm": 0.05185665935277939, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2054 + }, + { + "epoch": 1.641373801916933, + "grad_norm": 0.08230192214250565, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2055 + }, + { + "epoch": 1.6421725239616614, + "grad_norm": 0.10175196081399918, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2056 + }, + { + "epoch": 1.6429712460063897, + "grad_norm": 0.07616171985864639, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2057 + }, + { + "epoch": 1.6437699680511182, + "grad_norm": 0.4597811698913574, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2058 + }, + { + "epoch": 1.6445686900958467, + "grad_norm": 0.12450811266899109, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2059 + }, + { + "epoch": 1.645367412140575, + "grad_norm": 0.10847678035497665, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2060 + }, + { + "epoch": 1.6461661341853036, + "grad_norm": 0.05778864026069641, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2061 + }, + { + "epoch": 1.6469648562300319, + "grad_norm": 0.04321129992604256, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2062 + }, + { + "epoch": 1.6477635782747604, + "grad_norm": 0.05467045307159424, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2063 + }, + { + "epoch": 1.648562300319489, + "grad_norm": 0.044298864901065826, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2064 + }, + { + "epoch": 1.6493610223642172, + "grad_norm": 0.03863062337040901, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2065 + }, + { + "epoch": 1.6501597444089455, + "grad_norm": 0.04040979593992233, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2066 + }, + { + "epoch": 1.650958466453674, + "grad_norm": 0.03647322207689285, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2067 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 0.049459293484687805, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2068 + }, + { + "epoch": 1.6525559105431311, + "grad_norm": 0.052851296961307526, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2069 + }, + { + "epoch": 1.6533546325878594, + "grad_norm": 0.10360822081565857, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2070 + }, + { + "epoch": 1.6541533546325877, + "grad_norm": 0.18817105889320374, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2071 + }, + { + "epoch": 1.6549520766773163, + "grad_norm": 0.1711605340242386, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2072 + }, + { + "epoch": 1.6557507987220448, + "grad_norm": 0.08807278424501419, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2073 + }, + { + "epoch": 1.6565495207667733, + "grad_norm": 0.0631125420331955, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2074 + }, + { + "epoch": 1.6573482428115016, + "grad_norm": 0.17277394235134125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2075 + }, + { + "epoch": 1.65814696485623, + "grad_norm": 0.2353454977273941, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2076 + }, + { + "epoch": 1.6589456869009584, + "grad_norm": 0.18835891783237457, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2077 + }, + { + "epoch": 1.659744408945687, + "grad_norm": 0.08717352151870728, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2078 + }, + { + "epoch": 1.6605431309904153, + "grad_norm": 0.05640486627817154, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2079 + }, + { + "epoch": 1.6613418530351438, + "grad_norm": 0.11206189543008804, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2080 + }, + { + "epoch": 1.6621405750798721, + "grad_norm": 0.10098055750131607, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2081 + }, + { + "epoch": 1.6629392971246006, + "grad_norm": 0.04627184569835663, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2082 + }, + { + "epoch": 1.6637380191693292, + "grad_norm": 0.13048212230205536, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2083 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 0.22329512238502502, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2084 + }, + { + "epoch": 1.6653354632587858, + "grad_norm": 0.23544666171073914, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2085 + }, + { + "epoch": 1.6661341853035143, + "grad_norm": 0.1329459846019745, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2086 + }, + { + "epoch": 1.6669329073482428, + "grad_norm": 0.07398947328329086, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2087 + }, + { + "epoch": 1.6677316293929714, + "grad_norm": 0.1926809549331665, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2088 + }, + { + "epoch": 1.6685303514376997, + "grad_norm": 0.19097647070884705, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2089 + }, + { + "epoch": 1.669329073482428, + "grad_norm": 0.10474745184183121, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2090 + }, + { + "epoch": 1.6701277955271565, + "grad_norm": 0.04437112435698509, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2091 + }, + { + "epoch": 1.670926517571885, + "grad_norm": 0.13698135316371918, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2092 + }, + { + "epoch": 1.6717252396166136, + "grad_norm": 0.14437462389469147, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2093 + }, + { + "epoch": 1.6725239616613419, + "grad_norm": 0.0938732922077179, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2094 + }, + { + "epoch": 1.6733226837060702, + "grad_norm": 0.060729511082172394, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2095 + }, + { + "epoch": 1.6741214057507987, + "grad_norm": 0.05354619398713112, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2096 + }, + { + "epoch": 1.6749201277955272, + "grad_norm": 0.056909799575805664, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2097 + }, + { + "epoch": 1.6757188498402555, + "grad_norm": 0.09815286099910736, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2098 + }, + { + "epoch": 1.676517571884984, + "grad_norm": 0.1432102620601654, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2099 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 0.14039601385593414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2100 + }, + { + "epoch": 1.678115015974441, + "grad_norm": 0.06634008139371872, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2101 + }, + { + "epoch": 1.6789137380191694, + "grad_norm": 0.1347021609544754, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2102 + }, + { + "epoch": 1.6797124600638977, + "grad_norm": 0.24721868336200714, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2103 + }, + { + "epoch": 1.680511182108626, + "grad_norm": 0.23194770514965057, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2104 + }, + { + "epoch": 1.6813099041533546, + "grad_norm": 0.12276436388492584, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2105 + }, + { + "epoch": 1.682108626198083, + "grad_norm": 0.06224825233221054, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2106 + }, + { + "epoch": 1.6829073482428116, + "grad_norm": 0.20683766901493073, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2107 + }, + { + "epoch": 1.68370607028754, + "grad_norm": 0.26914462447166443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2108 + }, + { + "epoch": 1.6845047923322682, + "grad_norm": 0.20070654153823853, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2109 + }, + { + "epoch": 1.6853035143769968, + "grad_norm": 0.08465532958507538, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2110 + }, + { + "epoch": 1.6861022364217253, + "grad_norm": 0.10843367129564285, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2111 + }, + { + "epoch": 1.6869009584664538, + "grad_norm": 0.20252646505832672, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2112 + }, + { + "epoch": 1.6876996805111821, + "grad_norm": 0.11803672462701797, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2113 + }, + { + "epoch": 1.6884984025559104, + "grad_norm": 0.08800901472568512, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2114 + }, + { + "epoch": 1.689297124600639, + "grad_norm": 0.23917800188064575, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2115 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 0.21528035402297974, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2116 + }, + { + "epoch": 1.6908945686900958, + "grad_norm": 0.05292942747473717, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2117 + }, + { + "epoch": 1.6916932907348243, + "grad_norm": 0.12942583858966827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2118 + }, + { + "epoch": 1.6924920127795526, + "grad_norm": 0.19304881989955902, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2119 + }, + { + "epoch": 1.6932907348242812, + "grad_norm": 0.10951094329357147, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 2120 + }, + { + "epoch": 1.6940894568690097, + "grad_norm": 0.07684643566608429, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2121 + }, + { + "epoch": 1.694888178913738, + "grad_norm": 0.14990608394145966, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2122 + }, + { + "epoch": 1.6956869009584663, + "grad_norm": 0.1104716882109642, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2123 + }, + { + "epoch": 1.6964856230031948, + "grad_norm": 0.06538088619709015, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2124 + }, + { + "epoch": 1.6972843450479234, + "grad_norm": 0.05474448576569557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2125 + }, + { + "epoch": 1.6980830670926519, + "grad_norm": 0.0803864449262619, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2126 + }, + { + "epoch": 1.6988817891373802, + "grad_norm": 0.04384651407599449, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2127 + }, + { + "epoch": 1.6996805111821085, + "grad_norm": 0.07006746530532837, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 2128 + }, + { + "epoch": 1.700479233226837, + "grad_norm": 0.08840122073888779, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2129 + }, + { + "epoch": 1.7012779552715656, + "grad_norm": 0.06421404331922531, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2130 + }, + { + "epoch": 1.702076677316294, + "grad_norm": 0.03711751103401184, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2131 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 0.06725160032510757, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2132 + }, + { + "epoch": 1.7036741214057507, + "grad_norm": 0.0517839640378952, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2133 + }, + { + "epoch": 1.7044728434504792, + "grad_norm": 0.046399205923080444, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2134 + }, + { + "epoch": 1.7052715654952078, + "grad_norm": 0.05188435688614845, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2135 + }, + { + "epoch": 1.706070287539936, + "grad_norm": 0.08578629791736603, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2136 + }, + { + "epoch": 1.7068690095846646, + "grad_norm": 0.07895999401807785, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2137 + }, + { + "epoch": 1.707667731629393, + "grad_norm": 0.060662928968667984, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2138 + }, + { + "epoch": 1.7084664536741214, + "grad_norm": 0.08372191339731216, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2139 + }, + { + "epoch": 1.70926517571885, + "grad_norm": 0.1217966303229332, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2140 + }, + { + "epoch": 1.7100638977635783, + "grad_norm": 0.14054186642169952, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2141 + }, + { + "epoch": 1.7108626198083066, + "grad_norm": 0.11693520098924637, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2142 + }, + { + "epoch": 1.711661341853035, + "grad_norm": 0.04271163418889046, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2143 + }, + { + "epoch": 1.7124600638977636, + "grad_norm": 0.11898874491453171, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2144 + }, + { + "epoch": 1.7132587859424921, + "grad_norm": 0.2637499272823334, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2145 + }, + { + "epoch": 1.7140575079872205, + "grad_norm": 0.29218390583992004, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2146 + }, + { + "epoch": 1.7148562300319488, + "grad_norm": 0.1899375170469284, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2147 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 0.04336607828736305, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2148 + }, + { + "epoch": 1.7164536741214058, + "grad_norm": 0.14123578369617462, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2149 + }, + { + "epoch": 1.7172523961661343, + "grad_norm": 0.19930055737495422, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2150 + }, + { + "epoch": 1.7180511182108626, + "grad_norm": 0.1796298772096634, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2151 + }, + { + "epoch": 1.718849840255591, + "grad_norm": 0.07607068121433258, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2152 + }, + { + "epoch": 1.7196485623003195, + "grad_norm": 0.12980210781097412, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2153 + }, + { + "epoch": 1.720447284345048, + "grad_norm": 0.2507205009460449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2154 + }, + { + "epoch": 1.7212460063897763, + "grad_norm": 0.2388920783996582, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2155 + }, + { + "epoch": 1.7220447284345048, + "grad_norm": 0.13363847136497498, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 1.7228434504792332, + "grad_norm": 0.048030026257038116, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 2157 + }, + { + "epoch": 1.7236421725239617, + "grad_norm": 0.14619708061218262, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2158 + }, + { + "epoch": 1.7244408945686902, + "grad_norm": 0.22031216323375702, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2159 + }, + { + "epoch": 1.7252396166134185, + "grad_norm": 0.18440701067447662, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2160 + }, + { + "epoch": 1.7260383386581468, + "grad_norm": 0.08183866739273071, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2161 + }, + { + "epoch": 1.7268370607028753, + "grad_norm": 0.05314984545111656, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2162 + }, + { + "epoch": 1.7276357827476039, + "grad_norm": 0.1438753753900528, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2163 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 0.0881122425198555, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 2164 + }, + { + "epoch": 1.7292332268370607, + "grad_norm": 0.1165589690208435, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2165 + }, + { + "epoch": 1.730031948881789, + "grad_norm": 0.14884884655475616, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2166 + }, + { + "epoch": 1.7308306709265175, + "grad_norm": 0.10219287127256393, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2167 + }, + { + "epoch": 1.731629392971246, + "grad_norm": 0.059794824570417404, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2168 + }, + { + "epoch": 1.7324281150159746, + "grad_norm": 0.0538945347070694, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2169 + }, + { + "epoch": 1.733226837060703, + "grad_norm": 0.1016303226351738, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2170 + }, + { + "epoch": 1.7340255591054312, + "grad_norm": 0.058912694454193115, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2171 + }, + { + "epoch": 1.7348242811501597, + "grad_norm": 0.060018621385097504, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2172 + }, + { + "epoch": 1.7356230031948883, + "grad_norm": 0.05386706069111824, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2173 + }, + { + "epoch": 1.7364217252396166, + "grad_norm": 0.06266453117132187, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2174 + }, + { + "epoch": 1.7372204472843449, + "grad_norm": 0.1035243570804596, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 2175 + }, + { + "epoch": 1.7380191693290734, + "grad_norm": 0.17216888070106506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2176 + }, + { + "epoch": 1.738817891373802, + "grad_norm": 0.23428532481193542, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2177 + }, + { + "epoch": 1.7396166134185305, + "grad_norm": 0.21038073301315308, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2178 + }, + { + "epoch": 1.7404153354632588, + "grad_norm": 0.1487000286579132, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2179 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 0.03916196525096893, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2180 + }, + { + "epoch": 1.7420127795527156, + "grad_norm": 0.13702991604804993, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2181 + }, + { + "epoch": 1.7428115015974441, + "grad_norm": 0.21363528072834015, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2182 + }, + { + "epoch": 1.7436102236421727, + "grad_norm": 0.134271502494812, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2183 + }, + { + "epoch": 1.744408945686901, + "grad_norm": 0.062452565878629684, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2184 + }, + { + "epoch": 1.7452076677316293, + "grad_norm": 0.1745995730161667, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2185 + }, + { + "epoch": 1.7460063897763578, + "grad_norm": 0.19709894061088562, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2186 + }, + { + "epoch": 1.7468051118210863, + "grad_norm": 0.1201571598649025, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2187 + }, + { + "epoch": 1.7476038338658149, + "grad_norm": 0.03690087050199509, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2188 + }, + { + "epoch": 1.7484025559105432, + "grad_norm": 0.1387440711259842, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2189 + }, + { + "epoch": 1.7492012779552715, + "grad_norm": 0.2084781676530838, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2190 + }, + { + "epoch": 1.75, + "grad_norm": 0.17941167950630188, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2191 + }, + { + "epoch": 1.7507987220447285, + "grad_norm": 0.09751889854669571, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2192 + }, + { + "epoch": 1.7515974440894568, + "grad_norm": 0.04116421565413475, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2193 + }, + { + "epoch": 1.7523961661341851, + "grad_norm": 0.14683429896831512, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2194 + }, + { + "epoch": 1.7531948881789137, + "grad_norm": 0.19602352380752563, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2195 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 0.18503598868846893, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2196 + }, + { + "epoch": 1.7547923322683707, + "grad_norm": 0.09473808109760284, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2197 + }, + { + "epoch": 1.755591054313099, + "grad_norm": 0.05645129457116127, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2198 + }, + { + "epoch": 1.7563897763578273, + "grad_norm": 0.09260818362236023, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2199 + }, + { + "epoch": 1.7571884984025559, + "grad_norm": 0.045891985297203064, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2200 + }, + { + "epoch": 1.7579872204472844, + "grad_norm": 0.125623419880867, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2201 + }, + { + "epoch": 1.758785942492013, + "grad_norm": 0.18919512629508972, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2202 + }, + { + "epoch": 1.7595846645367412, + "grad_norm": 0.17549264430999756, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2203 + }, + { + "epoch": 1.7603833865814695, + "grad_norm": 0.047342319041490555, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 2204 + }, + { + "epoch": 1.761182108626198, + "grad_norm": 0.177268847823143, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2205 + }, + { + "epoch": 1.7619808306709266, + "grad_norm": 0.28258222341537476, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2206 + }, + { + "epoch": 1.7627795527156551, + "grad_norm": 0.25111353397369385, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2207 + }, + { + "epoch": 1.7635782747603834, + "grad_norm": 0.11864849925041199, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2208 + }, + { + "epoch": 1.7643769968051117, + "grad_norm": 0.06387785822153091, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2209 + }, + { + "epoch": 1.7651757188498403, + "grad_norm": 0.1264238804578781, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2210 + }, + { + "epoch": 1.7659744408945688, + "grad_norm": 0.12080882489681244, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2211 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.05618004873394966, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2212 + }, + { + "epoch": 1.7675718849840254, + "grad_norm": 0.06543037295341492, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2213 + }, + { + "epoch": 1.768370607028754, + "grad_norm": 0.08525256812572479, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2214 + }, + { + "epoch": 1.7691693290734825, + "grad_norm": 0.08571972697973251, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2215 + }, + { + "epoch": 1.769968051118211, + "grad_norm": 0.04897582530975342, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2216 + }, + { + "epoch": 1.7707667731629393, + "grad_norm": 0.07296427339315414, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2217 + }, + { + "epoch": 1.7715654952076676, + "grad_norm": 0.041904110461473465, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2218 + }, + { + "epoch": 1.7723642172523961, + "grad_norm": 0.053191233426332474, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2219 + }, + { + "epoch": 1.7731629392971247, + "grad_norm": 0.056369587779045105, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2220 + }, + { + "epoch": 1.7739616613418532, + "grad_norm": 0.06455157697200775, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2221 + }, + { + "epoch": 1.7747603833865815, + "grad_norm": 0.06467561423778534, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2222 + }, + { + "epoch": 1.7755591054313098, + "grad_norm": 0.07162238657474518, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2223 + }, + { + "epoch": 1.7763578274760383, + "grad_norm": 0.045193906873464584, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2224 + }, + { + "epoch": 1.7771565495207668, + "grad_norm": 0.07172992080450058, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2225 + }, + { + "epoch": 1.7779552715654952, + "grad_norm": 0.07163143157958984, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2226 + }, + { + "epoch": 1.7787539936102237, + "grad_norm": 0.11480346322059631, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2227 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 0.21525998413562775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 2228 + }, + { + "epoch": 1.7803514376996805, + "grad_norm": 0.20769886672496796, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2229 + }, + { + "epoch": 1.781150159744409, + "grad_norm": 0.13149204850196838, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2230 + }, + { + "epoch": 1.7819488817891374, + "grad_norm": 0.06223989278078079, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2231 + }, + { + "epoch": 1.7827476038338657, + "grad_norm": 0.11386150866746902, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2232 + }, + { + "epoch": 1.7835463258785942, + "grad_norm": 0.1448865532875061, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2233 + }, + { + "epoch": 1.7843450479233227, + "grad_norm": 0.11244893074035645, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2234 + }, + { + "epoch": 1.7851437699680512, + "grad_norm": 0.06307587027549744, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2235 + }, + { + "epoch": 1.7859424920127795, + "grad_norm": 0.1529018133878708, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2236 + }, + { + "epoch": 1.7867412140575079, + "grad_norm": 0.212649405002594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2237 + }, + { + "epoch": 1.7875399361022364, + "grad_norm": 0.18361856043338776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2238 + }, + { + "epoch": 1.788338658146965, + "grad_norm": 0.06960433721542358, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2239 + }, + { + "epoch": 1.7891373801916934, + "grad_norm": 0.13445821404457092, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2240 + }, + { + "epoch": 1.7899361022364217, + "grad_norm": 0.24758578836917877, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2241 + }, + { + "epoch": 1.79073482428115, + "grad_norm": 0.27208608388900757, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2242 + }, + { + "epoch": 1.7915335463258786, + "grad_norm": 0.1256505697965622, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2243 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 0.12209334224462509, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2244 + }, + { + "epoch": 1.7931309904153354, + "grad_norm": 0.2690032720565796, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2245 + }, + { + "epoch": 1.793929712460064, + "grad_norm": 0.27393221855163574, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2246 + }, + { + "epoch": 1.7947284345047922, + "grad_norm": 0.12508991360664368, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 2247 + }, + { + "epoch": 1.7955271565495208, + "grad_norm": 0.10001108795404434, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2248 + }, + { + "epoch": 1.7963258785942493, + "grad_norm": 0.2588697373867035, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2249 + }, + { + "epoch": 1.7971246006389776, + "grad_norm": 0.24723860621452332, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2250 + }, + { + "epoch": 1.797923322683706, + "grad_norm": 0.09018664062023163, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2251 + }, + { + "epoch": 1.7987220447284344, + "grad_norm": 0.09745316952466965, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 2252 + }, + { + "epoch": 1.799520766773163, + "grad_norm": 0.20877481997013092, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 2253 + }, + { + "epoch": 1.8003194888178915, + "grad_norm": 0.24291004240512848, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2254 + }, + { + "epoch": 1.8011182108626198, + "grad_norm": 0.1967754364013672, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2255 + }, + { + "epoch": 1.8019169329073481, + "grad_norm": 0.088215172290802, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2256 + }, + { + "epoch": 1.8027156549520766, + "grad_norm": 0.07018816471099854, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2257 + }, + { + "epoch": 1.8035143769968052, + "grad_norm": 0.17161858081817627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2258 + }, + { + "epoch": 1.8043130990415337, + "grad_norm": 0.22007174789905548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2259 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 0.16093726456165314, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2260 + }, + { + "epoch": 1.8059105431309903, + "grad_norm": 0.06763539463281631, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2261 + }, + { + "epoch": 1.8067092651757188, + "grad_norm": 0.1066257432103157, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2262 + }, + { + "epoch": 1.8075079872204474, + "grad_norm": 0.17658250033855438, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2263 + }, + { + "epoch": 1.8083067092651757, + "grad_norm": 0.21157506108283997, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2264 + }, + { + "epoch": 1.8091054313099042, + "grad_norm": 0.16717523336410522, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2265 + }, + { + "epoch": 1.8099041533546325, + "grad_norm": 0.08356527984142303, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2266 + }, + { + "epoch": 1.810702875399361, + "grad_norm": 0.11939100921154022, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2267 + }, + { + "epoch": 1.8115015974440896, + "grad_norm": 0.2322039157152176, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2268 + }, + { + "epoch": 1.8123003194888179, + "grad_norm": 0.2277170568704605, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2269 + }, + { + "epoch": 1.8130990415335462, + "grad_norm": 0.06634530425071716, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2270 + }, + { + "epoch": 1.8138977635782747, + "grad_norm": 0.20808424055576324, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2271 + }, + { + "epoch": 1.8146964856230032, + "grad_norm": 0.3761717975139618, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2272 + }, + { + "epoch": 1.8154952076677318, + "grad_norm": 0.3587193191051483, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2273 + }, + { + "epoch": 1.81629392971246, + "grad_norm": 0.12116564810276031, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2274 + }, + { + "epoch": 1.8170926517571884, + "grad_norm": 0.20137764513492584, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2275 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 0.30456987023353577, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2276 + }, + { + "epoch": 1.8186900958466454, + "grad_norm": 0.15625369548797607, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2277 + }, + { + "epoch": 1.819488817891374, + "grad_norm": 0.12682494521141052, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2278 + }, + { + "epoch": 1.8202875399361023, + "grad_norm": 0.26252153515815735, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2279 + }, + { + "epoch": 1.8210862619808306, + "grad_norm": 0.17610949277877808, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2280 + }, + { + "epoch": 1.821884984025559, + "grad_norm": 0.056205663830041885, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2281 + }, + { + "epoch": 1.8226837060702876, + "grad_norm": 0.1519095003604889, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2282 + }, + { + "epoch": 1.823482428115016, + "grad_norm": 0.1591203212738037, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2283 + }, + { + "epoch": 1.8242811501597445, + "grad_norm": 0.11261039227247238, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2284 + }, + { + "epoch": 1.8250798722044728, + "grad_norm": 0.06855058670043945, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2285 + }, + { + "epoch": 1.8258785942492013, + "grad_norm": 0.04728224128484726, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2286 + }, + { + "epoch": 1.8266773162939298, + "grad_norm": 0.0677042305469513, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2287 + }, + { + "epoch": 1.8274760383386581, + "grad_norm": 0.0836048573255539, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2288 + }, + { + "epoch": 1.8282747603833864, + "grad_norm": 0.0657985508441925, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2289 + }, + { + "epoch": 1.829073482428115, + "grad_norm": 0.05567999184131622, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2290 + }, + { + "epoch": 1.8298722044728435, + "grad_norm": 0.13710817694664001, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2291 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 0.14417411386966705, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2292 + }, + { + "epoch": 1.8314696485623003, + "grad_norm": 0.12273317575454712, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2293 + }, + { + "epoch": 1.8322683706070286, + "grad_norm": 0.12350328266620636, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2294 + }, + { + "epoch": 1.8330670926517572, + "grad_norm": 0.12832887470722198, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2295 + }, + { + "epoch": 1.8338658146964857, + "grad_norm": 0.17759868502616882, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2296 + }, + { + "epoch": 1.8346645367412142, + "grad_norm": 0.18485887348651886, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2297 + }, + { + "epoch": 1.8354632587859425, + "grad_norm": 0.11906488239765167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2298 + }, + { + "epoch": 1.8362619808306708, + "grad_norm": 0.04088319092988968, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2299 + }, + { + "epoch": 1.8370607028753994, + "grad_norm": 0.18988807499408722, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2300 + }, + { + "epoch": 1.8378594249201279, + "grad_norm": 0.2758033275604248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2301 + }, + { + "epoch": 1.8386581469648562, + "grad_norm": 0.26860401034355164, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2302 + }, + { + "epoch": 1.8394568690095847, + "grad_norm": 0.1770019680261612, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2303 + }, + { + "epoch": 1.840255591054313, + "grad_norm": 0.03740993142127991, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2304 + }, + { + "epoch": 1.8410543130990416, + "grad_norm": 0.13697518408298492, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2305 + }, + { + "epoch": 1.84185303514377, + "grad_norm": 0.15273790061473846, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2306 + }, + { + "epoch": 1.8426517571884984, + "grad_norm": 0.08181154727935791, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2307 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.05599624291062355, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2308 + }, + { + "epoch": 1.8442492012779552, + "grad_norm": 0.17429251968860626, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2309 + }, + { + "epoch": 1.8450479233226837, + "grad_norm": 0.20159491896629333, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2310 + }, + { + "epoch": 1.8458466453674123, + "grad_norm": 0.10825419425964355, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2311 + }, + { + "epoch": 1.8466453674121406, + "grad_norm": 0.0784185528755188, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2312 + }, + { + "epoch": 1.8474440894568689, + "grad_norm": 0.15851987898349762, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2313 + }, + { + "epoch": 1.8482428115015974, + "grad_norm": 0.11244971305131912, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2314 + }, + { + "epoch": 1.849041533546326, + "grad_norm": 0.04119047150015831, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2315 + }, + { + "epoch": 1.8498402555910545, + "grad_norm": 0.12872102856636047, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2316 + }, + { + "epoch": 1.8506389776357828, + "grad_norm": 0.1542259305715561, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2317 + }, + { + "epoch": 1.851437699680511, + "grad_norm": 0.09662868827581406, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2318 + }, + { + "epoch": 1.8522364217252396, + "grad_norm": 0.04452383890748024, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2319 + }, + { + "epoch": 1.8530351437699681, + "grad_norm": 0.03368959203362465, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2320 + }, + { + "epoch": 1.8538338658146964, + "grad_norm": 0.05867767333984375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2321 + }, + { + "epoch": 1.854632587859425, + "grad_norm": 0.0774846225976944, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2322 + }, + { + "epoch": 1.8554313099041533, + "grad_norm": 0.05172058939933777, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2323 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 0.06597824394702911, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2324 + }, + { + "epoch": 1.8570287539936103, + "grad_norm": 0.10818778723478317, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2325 + }, + { + "epoch": 1.8578274760383386, + "grad_norm": 0.12698976695537567, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2326 + }, + { + "epoch": 1.858626198083067, + "grad_norm": 0.06547659635543823, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2327 + }, + { + "epoch": 1.8594249201277955, + "grad_norm": 0.08613643050193787, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2328 + }, + { + "epoch": 1.860223642172524, + "grad_norm": 0.23452800512313843, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2329 + }, + { + "epoch": 1.8610223642172525, + "grad_norm": 0.29293227195739746, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2330 + }, + { + "epoch": 1.8618210862619808, + "grad_norm": 0.17590634524822235, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2331 + }, + { + "epoch": 1.8626198083067091, + "grad_norm": 0.09830035269260406, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2332 + }, + { + "epoch": 1.8634185303514377, + "grad_norm": 0.2336016595363617, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2333 + }, + { + "epoch": 1.8642172523961662, + "grad_norm": 0.22990736365318298, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2334 + }, + { + "epoch": 1.8650159744408947, + "grad_norm": 0.14177313446998596, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2335 + }, + { + "epoch": 1.865814696485623, + "grad_norm": 0.07447824627161026, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2336 + }, + { + "epoch": 1.8666134185303513, + "grad_norm": 0.20551882684230804, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2337 + }, + { + "epoch": 1.8674121405750799, + "grad_norm": 0.21193428337574005, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2338 + }, + { + "epoch": 1.8682108626198084, + "grad_norm": 0.09889520704746246, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2339 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 0.06506047397851944, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2340 + }, + { + "epoch": 1.869808306709265, + "grad_norm": 0.10613662004470825, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2341 + }, + { + "epoch": 1.8706070287539935, + "grad_norm": 0.13049691915512085, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2342 + }, + { + "epoch": 1.871405750798722, + "grad_norm": 0.07257628440856934, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2343 + }, + { + "epoch": 1.8722044728434506, + "grad_norm": 0.05402761325240135, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2344 + }, + { + "epoch": 1.873003194888179, + "grad_norm": 0.1298513114452362, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2345 + }, + { + "epoch": 1.8738019169329072, + "grad_norm": 0.18854250013828278, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2346 + }, + { + "epoch": 1.8746006389776357, + "grad_norm": 0.18749283254146576, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2347 + }, + { + "epoch": 1.8753993610223643, + "grad_norm": 0.0791897177696228, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2348 + }, + { + "epoch": 1.8761980830670928, + "grad_norm": 0.061554014682769775, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2349 + }, + { + "epoch": 1.876996805111821, + "grad_norm": 0.07776489108800888, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2350 + }, + { + "epoch": 1.8777955271565494, + "grad_norm": 0.06406589597463608, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2351 + }, + { + "epoch": 1.878594249201278, + "grad_norm": 0.04364178702235222, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2352 + }, + { + "epoch": 1.8793929712460065, + "grad_norm": 0.14296351373195648, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2353 + }, + { + "epoch": 1.880191693290735, + "grad_norm": 0.23554368317127228, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2354 + }, + { + "epoch": 1.8809904153354633, + "grad_norm": 0.17022013664245605, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 2355 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 0.055340252816677094, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2356 + }, + { + "epoch": 1.8825878594249201, + "grad_norm": 0.10552496463060379, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2357 + }, + { + "epoch": 1.8833865814696487, + "grad_norm": 0.1601826697587967, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2358 + }, + { + "epoch": 1.884185303514377, + "grad_norm": 0.15029270946979523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2359 + }, + { + "epoch": 1.8849840255591053, + "grad_norm": 0.05186127871274948, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2360 + }, + { + "epoch": 1.8857827476038338, + "grad_norm": 0.10678224265575409, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2361 + }, + { + "epoch": 1.8865814696485623, + "grad_norm": 0.1380450427532196, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2362 + }, + { + "epoch": 1.8873801916932909, + "grad_norm": 0.08721969276666641, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2363 + }, + { + "epoch": 1.8881789137380192, + "grad_norm": 0.09425338357686996, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2364 + }, + { + "epoch": 1.8889776357827475, + "grad_norm": 0.16815589368343353, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2365 + }, + { + "epoch": 1.889776357827476, + "grad_norm": 0.16181580722332, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2366 + }, + { + "epoch": 1.8905750798722045, + "grad_norm": 0.054028045386075974, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2367 + }, + { + "epoch": 1.891373801916933, + "grad_norm": 0.07199764251708984, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2368 + }, + { + "epoch": 1.8921725239616614, + "grad_norm": 0.08493109047412872, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2369 + }, + { + "epoch": 1.8929712460063897, + "grad_norm": 0.09665308892726898, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 2370 + }, + { + "epoch": 1.8937699680511182, + "grad_norm": 0.07975895702838898, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2371 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 0.06089888513088226, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2372 + }, + { + "epoch": 1.895367412140575, + "grad_norm": 0.04610683396458626, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2373 + }, + { + "epoch": 1.8961661341853036, + "grad_norm": 0.06083180755376816, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2374 + }, + { + "epoch": 1.8969648562300319, + "grad_norm": 0.07177560776472092, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 2375 + }, + { + "epoch": 1.8977635782747604, + "grad_norm": 0.04214467853307724, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2376 + }, + { + "epoch": 1.898562300319489, + "grad_norm": 0.05166957527399063, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2377 + }, + { + "epoch": 1.8993610223642172, + "grad_norm": 0.040181614458560944, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2378 + }, + { + "epoch": 1.9001597444089455, + "grad_norm": 0.043485358357429504, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2379 + }, + { + "epoch": 1.900958466453674, + "grad_norm": 0.07395761460065842, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2380 + }, + { + "epoch": 1.9017571884984026, + "grad_norm": 0.05133877694606781, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 2381 + }, + { + "epoch": 1.9025559105431311, + "grad_norm": 0.059279292821884155, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2382 + }, + { + "epoch": 1.9033546325878594, + "grad_norm": 0.07573487609624863, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2383 + }, + { + "epoch": 1.9041533546325877, + "grad_norm": 0.07013942301273346, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2384 + }, + { + "epoch": 1.9049520766773163, + "grad_norm": 0.14524684846401215, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2385 + }, + { + "epoch": 1.9057507987220448, + "grad_norm": 0.17374426126480103, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2386 + }, + { + "epoch": 1.9065495207667733, + "grad_norm": 0.1387263685464859, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2387 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 0.045813702046871185, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2388 + }, + { + "epoch": 1.90814696485623, + "grad_norm": 0.189321830868721, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2389 + }, + { + "epoch": 1.9089456869009584, + "grad_norm": 0.261329710483551, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2390 + }, + { + "epoch": 1.909744408945687, + "grad_norm": 0.1599399596452713, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2391 + }, + { + "epoch": 1.9105431309904153, + "grad_norm": 0.03977127745747566, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2392 + }, + { + "epoch": 1.9113418530351438, + "grad_norm": 0.16269442439079285, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2393 + }, + { + "epoch": 1.9121405750798721, + "grad_norm": 0.22963251173496246, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2394 + }, + { + "epoch": 1.9129392971246006, + "grad_norm": 0.1526031792163849, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2395 + }, + { + "epoch": 1.9137380191693292, + "grad_norm": 0.07236737757921219, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 2396 + }, + { + "epoch": 1.9145367412140575, + "grad_norm": 0.19993482530117035, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2397 + }, + { + "epoch": 1.9153354632587858, + "grad_norm": 0.18950621783733368, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2398 + }, + { + "epoch": 1.9161341853035143, + "grad_norm": 0.10046153515577316, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2399 + }, + { + "epoch": 1.9169329073482428, + "grad_norm": 0.07884453237056732, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2400 + }, + { + "epoch": 1.9177316293929714, + "grad_norm": 0.23947227001190186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2401 + }, + { + "epoch": 1.9185303514376997, + "grad_norm": 0.2662964165210724, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2402 + }, + { + "epoch": 1.919329073482428, + "grad_norm": 0.1257917582988739, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2403 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 0.09092582017183304, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2404 + }, + { + "epoch": 1.920926517571885, + "grad_norm": 0.19677215814590454, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2405 + }, + { + "epoch": 1.9217252396166136, + "grad_norm": 0.17972320318222046, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2406 + }, + { + "epoch": 1.9225239616613419, + "grad_norm": 0.06155665963888168, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2407 + }, + { + "epoch": 1.9233226837060702, + "grad_norm": 0.14805591106414795, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2408 + }, + { + "epoch": 1.9241214057507987, + "grad_norm": 0.2414662092924118, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2409 + }, + { + "epoch": 1.9249201277955272, + "grad_norm": 0.2084181308746338, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2410 + }, + { + "epoch": 1.9257188498402555, + "grad_norm": 0.05523146688938141, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2411 + }, + { + "epoch": 1.926517571884984, + "grad_norm": 0.13994552195072174, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2412 + }, + { + "epoch": 1.9273162939297124, + "grad_norm": 0.2648966312408447, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2413 + }, + { + "epoch": 1.928115015974441, + "grad_norm": 0.28959497809410095, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2414 + }, + { + "epoch": 1.9289137380191694, + "grad_norm": 0.11457488685846329, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2415 + }, + { + "epoch": 1.9297124600638977, + "grad_norm": 0.12448041886091232, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2416 + }, + { + "epoch": 1.930511182108626, + "grad_norm": 0.20807982981204987, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2417 + }, + { + "epoch": 1.9313099041533546, + "grad_norm": 0.14537623524665833, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2418 + }, + { + "epoch": 1.932108626198083, + "grad_norm": 0.0428709015250206, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2419 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.07923824340105057, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2420 + }, + { + "epoch": 1.93370607028754, + "grad_norm": 0.06046072393655777, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2421 + }, + { + "epoch": 1.9345047923322682, + "grad_norm": 0.05921380594372749, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2422 + }, + { + "epoch": 1.9353035143769968, + "grad_norm": 0.05324951559305191, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2423 + }, + { + "epoch": 1.9361022364217253, + "grad_norm": 0.060725487768650055, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2424 + }, + { + "epoch": 1.9369009584664538, + "grad_norm": 0.09305386245250702, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2425 + }, + { + "epoch": 1.9376996805111821, + "grad_norm": 0.12314888834953308, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2426 + }, + { + "epoch": 1.9384984025559104, + "grad_norm": 0.08590805530548096, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2427 + }, + { + "epoch": 1.939297124600639, + "grad_norm": 0.07134587317705154, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2428 + }, + { + "epoch": 1.9400958466453675, + "grad_norm": 0.04584966599941254, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2429 + }, + { + "epoch": 1.9408945686900958, + "grad_norm": 0.050389841198921204, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2430 + }, + { + "epoch": 1.9416932907348243, + "grad_norm": 0.055894333869218826, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2431 + }, + { + "epoch": 1.9424920127795526, + "grad_norm": 0.05231403559446335, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2432 + }, + { + "epoch": 1.9432907348242812, + "grad_norm": 0.04235154017806053, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2433 + }, + { + "epoch": 1.9440894568690097, + "grad_norm": 0.038994334638118744, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2434 + }, + { + "epoch": 1.944888178913738, + "grad_norm": 0.062291134148836136, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2435 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.10267619043588638, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2436 + }, + { + "epoch": 1.9464856230031948, + "grad_norm": 0.12227646261453629, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2437 + }, + { + "epoch": 1.9472843450479234, + "grad_norm": 0.07677904516458511, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2438 + }, + { + "epoch": 1.9480830670926519, + "grad_norm": 0.043213456869125366, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2439 + }, + { + "epoch": 1.9488817891373802, + "grad_norm": 0.0464320071041584, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2440 + }, + { + "epoch": 1.9496805111821085, + "grad_norm": 0.0488814078271389, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2441 + }, + { + "epoch": 1.950479233226837, + "grad_norm": 0.07102649658918381, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2442 + }, + { + "epoch": 1.9512779552715656, + "grad_norm": 0.056355372071266174, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2443 + }, + { + "epoch": 1.952076677316294, + "grad_norm": 0.05412770435214043, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2444 + }, + { + "epoch": 1.9528753993610224, + "grad_norm": 0.05533284693956375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2445 + }, + { + "epoch": 1.9536741214057507, + "grad_norm": 0.07065420597791672, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2446 + }, + { + "epoch": 1.9544728434504792, + "grad_norm": 0.0424923375248909, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2447 + }, + { + "epoch": 1.9552715654952078, + "grad_norm": 0.07682394236326218, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2448 + }, + { + "epoch": 1.956070287539936, + "grad_norm": 0.12305673956871033, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2449 + }, + { + "epoch": 1.9568690095846646, + "grad_norm": 0.12699945271015167, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2450 + }, + { + "epoch": 1.957667731629393, + "grad_norm": 0.09973076730966568, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2451 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 0.04687270149588585, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2452 + }, + { + "epoch": 1.95926517571885, + "grad_norm": 0.16843228042125702, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2453 + }, + { + "epoch": 1.9600638977635783, + "grad_norm": 0.27191975712776184, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2454 + }, + { + "epoch": 1.9608626198083066, + "grad_norm": 0.2563989460468292, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2455 + }, + { + "epoch": 1.961661341853035, + "grad_norm": 0.10264059901237488, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2456 + }, + { + "epoch": 1.9624600638977636, + "grad_norm": 0.12051466107368469, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2457 + }, + { + "epoch": 1.9632587859424921, + "grad_norm": 0.27400559186935425, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2458 + }, + { + "epoch": 1.9640575079872205, + "grad_norm": 0.2756473124027252, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2459 + }, + { + "epoch": 1.9648562300319488, + "grad_norm": 0.09925543516874313, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2460 + }, + { + "epoch": 1.9656549520766773, + "grad_norm": 0.18176420032978058, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2461 + }, + { + "epoch": 1.9664536741214058, + "grad_norm": 0.353693425655365, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 2462 + }, + { + "epoch": 1.9672523961661343, + "grad_norm": 0.30674099922180176, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2463 + }, + { + "epoch": 1.9680511182108626, + "grad_norm": 0.04689846560359001, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2464 + }, + { + "epoch": 1.968849840255591, + "grad_norm": 0.29758918285369873, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2465 + }, + { + "epoch": 1.9696485623003195, + "grad_norm": 0.363922655582428, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2466 + }, + { + "epoch": 1.970447284345048, + "grad_norm": 0.19258317351341248, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2467 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 0.10317967087030411, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2468 + }, + { + "epoch": 1.9720447284345048, + "grad_norm": 0.2375856637954712, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2469 + }, + { + "epoch": 1.9728434504792332, + "grad_norm": 0.13130125403404236, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2470 + }, + { + "epoch": 1.9736421725239617, + "grad_norm": 0.08131767064332962, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2471 + }, + { + "epoch": 1.9744408945686902, + "grad_norm": 0.14860530197620392, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2472 + }, + { + "epoch": 1.9752396166134185, + "grad_norm": 0.11777997016906738, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2473 + }, + { + "epoch": 1.9760383386581468, + "grad_norm": 0.08397025614976883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2474 + }, + { + "epoch": 1.9768370607028753, + "grad_norm": 0.08824057132005692, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2475 + }, + { + "epoch": 1.9776357827476039, + "grad_norm": 0.06647378206253052, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2476 + }, + { + "epoch": 1.9784345047923324, + "grad_norm": 0.038043633103370667, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2477 + }, + { + "epoch": 1.9792332268370607, + "grad_norm": 0.08245793730020523, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2478 + }, + { + "epoch": 1.980031948881789, + "grad_norm": 0.1402815282344818, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2479 + }, + { + "epoch": 1.9808306709265175, + "grad_norm": 0.15749140083789825, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2480 + }, + { + "epoch": 1.981629392971246, + "grad_norm": 0.09396994858980179, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2481 + }, + { + "epoch": 1.9824281150159746, + "grad_norm": 0.0725923553109169, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2482 + }, + { + "epoch": 1.983226837060703, + "grad_norm": 0.06790316104888916, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2483 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 0.04050496965646744, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2484 + }, + { + "epoch": 1.9848242811501597, + "grad_norm": 0.04245828837156296, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2485 + }, + { + "epoch": 1.9856230031948883, + "grad_norm": 0.04818668216466904, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2486 + }, + { + "epoch": 1.9864217252396166, + "grad_norm": 0.07091481238603592, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2487 + }, + { + "epoch": 1.9872204472843449, + "grad_norm": 0.08975768834352493, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2488 + }, + { + "epoch": 1.9880191693290734, + "grad_norm": 0.0920509397983551, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2489 + }, + { + "epoch": 1.988817891373802, + "grad_norm": 0.06188343092799187, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2490 + }, + { + "epoch": 1.9896166134185305, + "grad_norm": 0.03998660668730736, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2491 + }, + { + "epoch": 1.9904153354632588, + "grad_norm": 0.03859339654445648, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2492 + }, + { + "epoch": 1.991214057507987, + "grad_norm": 0.050228461623191833, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2493 + }, + { + "epoch": 1.9920127795527156, + "grad_norm": 0.04037710279226303, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2494 + }, + { + "epoch": 1.9928115015974441, + "grad_norm": 0.04584654048085213, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2495 + }, + { + "epoch": 1.9936102236421727, + "grad_norm": 0.03696245700120926, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2496 + }, + { + "epoch": 1.994408945686901, + "grad_norm": 0.04600491747260094, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2497 + }, + { + "epoch": 1.9952076677316293, + "grad_norm": 0.0943571925163269, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2498 + }, + { + "epoch": 1.9960063897763578, + "grad_norm": 0.11350230127573013, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2499 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.09816325455904007, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2500 + }, + { + "epoch": 1.9976038338658149, + "grad_norm": 0.05887974426150322, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2501 + }, + { + "epoch": 1.9984025559105432, + "grad_norm": 0.039232514798641205, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2502 + }, + { + "epoch": 1.9992012779552715, + "grad_norm": 0.10776908695697784, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2503 + }, + { + "epoch": 2.0, + "grad_norm": 0.1708499789237976, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2504 + }, + { + "epoch": 2.0007987220447285, + "grad_norm": 0.12712575495243073, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2505 + }, + { + "epoch": 2.001597444089457, + "grad_norm": 0.04130035266280174, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2506 + }, + { + "epoch": 2.002396166134185, + "grad_norm": 0.08062197268009186, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2507 + }, + { + "epoch": 2.0031948881789137, + "grad_norm": 0.11429931968450546, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2508 + }, + { + "epoch": 2.003993610223642, + "grad_norm": 0.06290867924690247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2509 + }, + { + "epoch": 2.0047923322683707, + "grad_norm": 0.043735455721616745, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2510 + }, + { + "epoch": 2.0055910543130993, + "grad_norm": 0.08331973850727081, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2511 + }, + { + "epoch": 2.0063897763578273, + "grad_norm": 0.07424676418304443, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2512 + }, + { + "epoch": 2.007188498402556, + "grad_norm": 0.0450097881257534, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2513 + }, + { + "epoch": 2.0079872204472844, + "grad_norm": 0.05486248433589935, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2514 + }, + { + "epoch": 2.008785942492013, + "grad_norm": 0.03456762805581093, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2515 + }, + { + "epoch": 2.009584664536741, + "grad_norm": 0.060457173734903336, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2516 + }, + { + "epoch": 2.0103833865814695, + "grad_norm": 0.11361896246671677, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2517 + }, + { + "epoch": 2.011182108626198, + "grad_norm": 0.13272768259048462, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2518 + }, + { + "epoch": 2.0119808306709266, + "grad_norm": 0.06579867750406265, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2519 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.06989869475364685, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2520 + }, + { + "epoch": 2.013578274760383, + "grad_norm": 0.10227718949317932, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2521 + }, + { + "epoch": 2.0143769968051117, + "grad_norm": 0.1155320331454277, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2522 + }, + { + "epoch": 2.0151757188498403, + "grad_norm": 0.08428250998258591, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2523 + }, + { + "epoch": 2.015974440894569, + "grad_norm": 0.07322479784488678, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2524 + }, + { + "epoch": 2.0167731629392973, + "grad_norm": 0.0683116540312767, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2525 + }, + { + "epoch": 2.0175718849840254, + "grad_norm": 0.05594201013445854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2526 + }, + { + "epoch": 2.018370607028754, + "grad_norm": 0.08582351356744766, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2527 + }, + { + "epoch": 2.0191693290734825, + "grad_norm": 0.16223077476024628, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2528 + }, + { + "epoch": 2.019968051118211, + "grad_norm": 0.23563791811466217, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2529 + }, + { + "epoch": 2.0207667731629395, + "grad_norm": 0.2101173847913742, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2530 + }, + { + "epoch": 2.0215654952076676, + "grad_norm": 0.14453741908073425, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2531 + }, + { + "epoch": 2.022364217252396, + "grad_norm": 0.050489380955696106, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2532 + }, + { + "epoch": 2.0231629392971247, + "grad_norm": 0.17723125219345093, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2533 + }, + { + "epoch": 2.023961661341853, + "grad_norm": 0.18600088357925415, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2534 + }, + { + "epoch": 2.0247603833865813, + "grad_norm": 0.10898424685001373, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2535 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.07256787270307541, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2536 + }, + { + "epoch": 2.0263578274760383, + "grad_norm": 0.1978672444820404, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2537 + }, + { + "epoch": 2.027156549520767, + "grad_norm": 0.20623594522476196, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2538 + }, + { + "epoch": 2.0279552715654954, + "grad_norm": 0.08837094157934189, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2539 + }, + { + "epoch": 2.0287539936102235, + "grad_norm": 0.10977557301521301, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2540 + }, + { + "epoch": 2.029552715654952, + "grad_norm": 0.24850067496299744, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2541 + }, + { + "epoch": 2.0303514376996805, + "grad_norm": 0.29207590222358704, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2542 + }, + { + "epoch": 2.031150159744409, + "grad_norm": 0.1985940933227539, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2543 + }, + { + "epoch": 2.0319488817891376, + "grad_norm": 0.04519326612353325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2544 + }, + { + "epoch": 2.0327476038338657, + "grad_norm": 0.16939495503902435, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2545 + }, + { + "epoch": 2.033546325878594, + "grad_norm": 0.270275354385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2546 + }, + { + "epoch": 2.0343450479233227, + "grad_norm": 0.21180108189582825, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2547 + }, + { + "epoch": 2.0351437699680512, + "grad_norm": 0.0469316728413105, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2548 + }, + { + "epoch": 2.0359424920127798, + "grad_norm": 0.1845361739397049, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2549 + }, + { + "epoch": 2.036741214057508, + "grad_norm": 0.2276308536529541, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2550 + }, + { + "epoch": 2.0375399361022364, + "grad_norm": 0.11676277965307236, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2551 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 0.1021813154220581, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2552 + }, + { + "epoch": 2.0391373801916934, + "grad_norm": 0.28504467010498047, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2553 + }, + { + "epoch": 2.0399361022364215, + "grad_norm": 0.2821798324584961, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2554 + }, + { + "epoch": 2.04073482428115, + "grad_norm": 0.09673242270946503, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2555 + }, + { + "epoch": 2.0415335463258786, + "grad_norm": 0.1784241944551468, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2556 + }, + { + "epoch": 2.042332268370607, + "grad_norm": 0.30749815702438354, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2557 + }, + { + "epoch": 2.0431309904153356, + "grad_norm": 0.2625802457332611, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2558 + }, + { + "epoch": 2.0439297124600637, + "grad_norm": 0.0651462972164154, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 2559 + }, + { + "epoch": 2.0447284345047922, + "grad_norm": 0.2103819102048874, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 2560 + }, + { + "epoch": 2.0455271565495208, + "grad_norm": 0.2854102849960327, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2561 + }, + { + "epoch": 2.0463258785942493, + "grad_norm": 0.14184293150901794, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2562 + }, + { + "epoch": 2.047124600638978, + "grad_norm": 0.06151473522186279, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2563 + }, + { + "epoch": 2.047923322683706, + "grad_norm": 0.1858600378036499, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2564 + }, + { + "epoch": 2.0487220447284344, + "grad_norm": 0.19997341930866241, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2565 + }, + { + "epoch": 2.049520766773163, + "grad_norm": 0.0924893170595169, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2566 + }, + { + "epoch": 2.0503194888178915, + "grad_norm": 0.14571507275104523, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2567 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.2566513121128082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2568 + }, + { + "epoch": 2.051916932907348, + "grad_norm": 0.24462486803531647, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2569 + }, + { + "epoch": 2.0527156549520766, + "grad_norm": 0.10544434189796448, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2570 + }, + { + "epoch": 2.053514376996805, + "grad_norm": 0.08675491809844971, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2571 + }, + { + "epoch": 2.0543130990415337, + "grad_norm": 0.18398417532444, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2572 + }, + { + "epoch": 2.055111821086262, + "grad_norm": 0.15167878568172455, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2573 + }, + { + "epoch": 2.0559105431309903, + "grad_norm": 0.06932301074266434, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2574 + }, + { + "epoch": 2.056709265175719, + "grad_norm": 0.06368319690227509, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2575 + }, + { + "epoch": 2.0575079872204474, + "grad_norm": 0.11785905808210373, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2576 + }, + { + "epoch": 2.058306709265176, + "grad_norm": 0.05494855344295502, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2577 + }, + { + "epoch": 2.059105431309904, + "grad_norm": 0.10618741810321808, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2578 + }, + { + "epoch": 2.0599041533546325, + "grad_norm": 0.14729735255241394, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2579 + }, + { + "epoch": 2.060702875399361, + "grad_norm": 0.08014677464962006, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2580 + }, + { + "epoch": 2.0615015974440896, + "grad_norm": 0.07460471242666245, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2581 + }, + { + "epoch": 2.062300319488818, + "grad_norm": 0.12884479761123657, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2582 + }, + { + "epoch": 2.063099041533546, + "grad_norm": 0.11224616318941116, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2583 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.06026687100529671, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2584 + }, + { + "epoch": 2.0646964856230032, + "grad_norm": 0.06690093874931335, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2585 + }, + { + "epoch": 2.0654952076677318, + "grad_norm": 0.10095079988241196, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2586 + }, + { + "epoch": 2.06629392971246, + "grad_norm": 0.08353506028652191, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2587 + }, + { + "epoch": 2.0670926517571884, + "grad_norm": 0.07060668617486954, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2588 + }, + { + "epoch": 2.067891373801917, + "grad_norm": 0.07298587262630463, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2589 + }, + { + "epoch": 2.0686900958466454, + "grad_norm": 0.04319034889340401, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2590 + }, + { + "epoch": 2.069488817891374, + "grad_norm": 0.04229504242539406, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2591 + }, + { + "epoch": 2.070287539936102, + "grad_norm": 0.05476998910307884, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2592 + }, + { + "epoch": 2.0710862619808306, + "grad_norm": 0.039188139140605927, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2593 + }, + { + "epoch": 2.071884984025559, + "grad_norm": 0.058993417769670486, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2594 + }, + { + "epoch": 2.0726837060702876, + "grad_norm": 0.04871759191155434, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2595 + }, + { + "epoch": 2.073482428115016, + "grad_norm": 0.037119925022125244, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2596 + }, + { + "epoch": 2.0742811501597442, + "grad_norm": 0.06476760655641556, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2597 + }, + { + "epoch": 2.0750798722044728, + "grad_norm": 0.03558475151658058, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2598 + }, + { + "epoch": 2.0758785942492013, + "grad_norm": 0.03988872841000557, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2599 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.04446236789226532, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2600 + }, + { + "epoch": 2.0774760383386583, + "grad_norm": 0.058075740933418274, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2601 + }, + { + "epoch": 2.0782747603833864, + "grad_norm": 0.10492820292711258, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2602 + }, + { + "epoch": 2.079073482428115, + "grad_norm": 0.1374005526304245, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2603 + }, + { + "epoch": 2.0798722044728435, + "grad_norm": 0.10932788252830505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 2604 + }, + { + "epoch": 2.080670926517572, + "grad_norm": 0.035826049745082855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2605 + }, + { + "epoch": 2.0814696485623, + "grad_norm": 0.10934802889823914, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2606 + }, + { + "epoch": 2.0822683706070286, + "grad_norm": 0.13302485644817352, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2607 + }, + { + "epoch": 2.083067092651757, + "grad_norm": 0.11253390461206436, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2608 + }, + { + "epoch": 2.0838658146964857, + "grad_norm": 0.04634593054652214, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2609 + }, + { + "epoch": 2.084664536741214, + "grad_norm": 0.21137909591197968, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2610 + }, + { + "epoch": 2.0854632587859423, + "grad_norm": 0.2771414816379547, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2611 + }, + { + "epoch": 2.086261980830671, + "grad_norm": 0.1959906965494156, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2612 + }, + { + "epoch": 2.0870607028753994, + "grad_norm": 0.042694322764873505, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2613 + }, + { + "epoch": 2.087859424920128, + "grad_norm": 0.15753871202468872, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2614 + }, + { + "epoch": 2.0886581469648564, + "grad_norm": 0.1917339563369751, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2615 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.05056089907884598, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2616 + }, + { + "epoch": 2.090255591054313, + "grad_norm": 0.16167999804019928, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2617 + }, + { + "epoch": 2.0910543130990416, + "grad_norm": 0.21019205451011658, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2618 + }, + { + "epoch": 2.09185303514377, + "grad_norm": 0.12859253585338593, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2619 + }, + { + "epoch": 2.0926517571884986, + "grad_norm": 0.04561556130647659, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2620 + }, + { + "epoch": 2.0934504792332267, + "grad_norm": 0.19915086030960083, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2621 + }, + { + "epoch": 2.094249201277955, + "grad_norm": 0.2792043685913086, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2622 + }, + { + "epoch": 2.0950479233226837, + "grad_norm": 0.16861289739608765, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2623 + }, + { + "epoch": 2.0958466453674123, + "grad_norm": 0.08431511372327805, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2624 + }, + { + "epoch": 2.0966453674121404, + "grad_norm": 0.26860734820365906, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2625 + }, + { + "epoch": 2.097444089456869, + "grad_norm": 0.2949545979499817, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2626 + }, + { + "epoch": 2.0982428115015974, + "grad_norm": 0.12639857828617096, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2627 + }, + { + "epoch": 2.099041533546326, + "grad_norm": 0.14675533771514893, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2628 + }, + { + "epoch": 2.0998402555910545, + "grad_norm": 0.29298654198646545, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2629 + }, + { + "epoch": 2.1006389776357826, + "grad_norm": 0.20049460232257843, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2630 + }, + { + "epoch": 2.101437699680511, + "grad_norm": 0.05280651897192001, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2631 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.2405036836862564, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2632 + }, + { + "epoch": 2.103035143769968, + "grad_norm": 0.29925718903541565, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2633 + }, + { + "epoch": 2.1038338658146967, + "grad_norm": 0.1330690085887909, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2634 + }, + { + "epoch": 2.1046325878594248, + "grad_norm": 0.11366300284862518, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2635 + }, + { + "epoch": 2.1054313099041533, + "grad_norm": 0.184611514210701, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2636 + }, + { + "epoch": 2.106230031948882, + "grad_norm": 0.0942547619342804, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2637 + }, + { + "epoch": 2.1070287539936103, + "grad_norm": 0.09224486351013184, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2638 + }, + { + "epoch": 2.107827476038339, + "grad_norm": 0.2167433351278305, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2639 + }, + { + "epoch": 2.108626198083067, + "grad_norm": 0.20001453161239624, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2640 + }, + { + "epoch": 2.1094249201277955, + "grad_norm": 0.0551394522190094, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2641 + }, + { + "epoch": 2.110223642172524, + "grad_norm": 0.14991897344589233, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2642 + }, + { + "epoch": 2.1110223642172525, + "grad_norm": 0.21038007736206055, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2643 + }, + { + "epoch": 2.1118210862619806, + "grad_norm": 0.11942024528980255, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2644 + }, + { + "epoch": 2.112619808306709, + "grad_norm": 0.14938029646873474, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2645 + }, + { + "epoch": 2.1134185303514377, + "grad_norm": 0.3405923843383789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2646 + }, + { + "epoch": 2.114217252396166, + "grad_norm": 0.3363925814628601, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2647 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.12379220873117447, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2648 + }, + { + "epoch": 2.115814696485623, + "grad_norm": 0.1583731323480606, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2649 + }, + { + "epoch": 2.1166134185303513, + "grad_norm": 0.2941076457500458, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2650 + }, + { + "epoch": 2.11741214057508, + "grad_norm": 0.18513287603855133, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2651 + }, + { + "epoch": 2.1182108626198084, + "grad_norm": 0.057797662913799286, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2652 + }, + { + "epoch": 2.119009584664537, + "grad_norm": 0.12461342662572861, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2653 + }, + { + "epoch": 2.119808306709265, + "grad_norm": 0.06276709586381912, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2654 + }, + { + "epoch": 2.1206070287539935, + "grad_norm": 0.06073528528213501, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2655 + }, + { + "epoch": 2.121405750798722, + "grad_norm": 0.07055814564228058, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2656 + }, + { + "epoch": 2.1222044728434506, + "grad_norm": 0.03508429974317551, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2657 + }, + { + "epoch": 2.123003194888179, + "grad_norm": 0.0474206916987896, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2658 + }, + { + "epoch": 2.123801916932907, + "grad_norm": 0.04067448526620865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2659 + }, + { + "epoch": 2.1246006389776357, + "grad_norm": 0.060025133192539215, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2660 + }, + { + "epoch": 2.1253993610223643, + "grad_norm": 0.061696235090494156, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2661 + }, + { + "epoch": 2.126198083067093, + "grad_norm": 0.060907844454050064, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2662 + }, + { + "epoch": 2.126996805111821, + "grad_norm": 0.06122025474905968, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 2663 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.06885300576686859, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2664 + }, + { + "epoch": 2.128594249201278, + "grad_norm": 0.047428976744413376, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2665 + }, + { + "epoch": 2.1293929712460065, + "grad_norm": 0.036644674837589264, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2666 + }, + { + "epoch": 2.130191693290735, + "grad_norm": 0.04983266070485115, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2667 + }, + { + "epoch": 2.130990415335463, + "grad_norm": 0.09072417765855789, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2668 + }, + { + "epoch": 2.1317891373801916, + "grad_norm": 0.10644412785768509, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2669 + }, + { + "epoch": 2.13258785942492, + "grad_norm": 0.07350479066371918, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2670 + }, + { + "epoch": 2.1333865814696487, + "grad_norm": 0.041709840297698975, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2671 + }, + { + "epoch": 2.134185303514377, + "grad_norm": 0.043592557311058044, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2672 + }, + { + "epoch": 2.1349840255591053, + "grad_norm": 0.04548558592796326, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2673 + }, + { + "epoch": 2.135782747603834, + "grad_norm": 0.03937267139554024, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2674 + }, + { + "epoch": 2.1365814696485623, + "grad_norm": 0.05674131214618683, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2675 + }, + { + "epoch": 2.137380191693291, + "grad_norm": 0.0857989713549614, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2676 + }, + { + "epoch": 2.1381789137380194, + "grad_norm": 0.12659871578216553, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2677 + }, + { + "epoch": 2.1389776357827475, + "grad_norm": 0.10000529885292053, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2678 + }, + { + "epoch": 2.139776357827476, + "grad_norm": 0.060805950313806534, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2679 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.20407895743846893, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2680 + }, + { + "epoch": 2.141373801916933, + "grad_norm": 0.21931609511375427, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2681 + }, + { + "epoch": 2.142172523961661, + "grad_norm": 0.0947318896651268, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2682 + }, + { + "epoch": 2.1429712460063897, + "grad_norm": 0.10082453489303589, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2683 + }, + { + "epoch": 2.143769968051118, + "grad_norm": 0.2510482370853424, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2684 + }, + { + "epoch": 2.1445686900958467, + "grad_norm": 0.2802210748195648, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 2685 + }, + { + "epoch": 2.1453674121405752, + "grad_norm": 0.18770602345466614, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2686 + }, + { + "epoch": 2.1461661341853033, + "grad_norm": 0.048588722944259644, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2687 + }, + { + "epoch": 2.146964856230032, + "grad_norm": 0.1443304419517517, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2688 + }, + { + "epoch": 2.1477635782747604, + "grad_norm": 0.22439543902873993, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2689 + }, + { + "epoch": 2.148562300319489, + "grad_norm": 0.16312581300735474, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2690 + }, + { + "epoch": 2.1493610223642174, + "grad_norm": 0.08721408247947693, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2691 + }, + { + "epoch": 2.1501597444089455, + "grad_norm": 0.2756902873516083, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2692 + }, + { + "epoch": 2.150958466453674, + "grad_norm": 0.2834199070930481, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2693 + }, + { + "epoch": 2.1517571884984026, + "grad_norm": 0.1190086081624031, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2694 + }, + { + "epoch": 2.152555910543131, + "grad_norm": 0.1246909499168396, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2695 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.2244880348443985, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2696 + }, + { + "epoch": 2.1541533546325877, + "grad_norm": 0.1424233317375183, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2697 + }, + { + "epoch": 2.1549520766773163, + "grad_norm": 0.10756697505712509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2698 + }, + { + "epoch": 2.155750798722045, + "grad_norm": 0.1688450276851654, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2699 + }, + { + "epoch": 2.1565495207667733, + "grad_norm": 0.12139362096786499, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 2700 + }, + { + "epoch": 2.1573482428115014, + "grad_norm": 0.07833441346883774, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 2701 + }, + { + "epoch": 2.15814696485623, + "grad_norm": 0.22099994122982025, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2702 + }, + { + "epoch": 2.1589456869009584, + "grad_norm": 0.190511554479599, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2703 + }, + { + "epoch": 2.159744408945687, + "grad_norm": 0.07637764513492584, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2704 + }, + { + "epoch": 2.1605431309904155, + "grad_norm": 0.06381702423095703, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2705 + }, + { + "epoch": 2.1613418530351436, + "grad_norm": 0.1343991458415985, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2706 + }, + { + "epoch": 2.162140575079872, + "grad_norm": 0.13090470433235168, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2707 + }, + { + "epoch": 2.1629392971246006, + "grad_norm": 0.04627209156751633, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2708 + }, + { + "epoch": 2.163738019169329, + "grad_norm": 0.060849517583847046, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2709 + }, + { + "epoch": 2.1645367412140577, + "grad_norm": 0.06780707836151123, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2710 + }, + { + "epoch": 2.165335463258786, + "grad_norm": 0.07282490283250809, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 2711 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 0.07168543338775635, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2712 + }, + { + "epoch": 2.166932907348243, + "grad_norm": 0.08716403692960739, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2713 + }, + { + "epoch": 2.1677316293929714, + "grad_norm": 0.09366965293884277, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2714 + }, + { + "epoch": 2.1685303514377, + "grad_norm": 0.09121392667293549, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2715 + }, + { + "epoch": 2.169329073482428, + "grad_norm": 0.06912577152252197, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2716 + }, + { + "epoch": 2.1701277955271565, + "grad_norm": 0.046476542949676514, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2717 + }, + { + "epoch": 2.170926517571885, + "grad_norm": 0.04065564647316933, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2718 + }, + { + "epoch": 2.1717252396166136, + "grad_norm": 0.044998086988925934, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2719 + }, + { + "epoch": 2.1725239616613417, + "grad_norm": 0.04588993638753891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2720 + }, + { + "epoch": 2.17332268370607, + "grad_norm": 0.05954091623425484, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2721 + }, + { + "epoch": 2.1741214057507987, + "grad_norm": 0.07627220451831818, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2722 + }, + { + "epoch": 2.1749201277955272, + "grad_norm": 0.0832771435379982, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2723 + }, + { + "epoch": 2.1757188498402558, + "grad_norm": 0.09901522845029831, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2724 + }, + { + "epoch": 2.176517571884984, + "grad_norm": 0.05773104354739189, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2725 + }, + { + "epoch": 2.1773162939297124, + "grad_norm": 0.0783318281173706, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2726 + }, + { + "epoch": 2.178115015974441, + "grad_norm": 0.12447014451026917, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2727 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.08944697678089142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2728 + }, + { + "epoch": 2.179712460063898, + "grad_norm": 0.07295451313257217, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2729 + }, + { + "epoch": 2.180511182108626, + "grad_norm": 0.1335693746805191, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2730 + }, + { + "epoch": 2.1813099041533546, + "grad_norm": 0.14618094265460968, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2731 + }, + { + "epoch": 2.182108626198083, + "grad_norm": 0.05047796294093132, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2732 + }, + { + "epoch": 2.1829073482428116, + "grad_norm": 0.18955212831497192, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2733 + }, + { + "epoch": 2.18370607028754, + "grad_norm": 0.3394540250301361, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2734 + }, + { + "epoch": 2.1845047923322682, + "grad_norm": 0.34607887268066406, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2735 + }, + { + "epoch": 2.1853035143769968, + "grad_norm": 0.19489939510822296, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2736 + }, + { + "epoch": 2.1861022364217253, + "grad_norm": 0.06775379180908203, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2737 + }, + { + "epoch": 2.186900958466454, + "grad_norm": 0.2376859039068222, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2738 + }, + { + "epoch": 2.187699680511182, + "grad_norm": 0.22686026990413666, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2739 + }, + { + "epoch": 2.1884984025559104, + "grad_norm": 0.059437282383441925, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2740 + }, + { + "epoch": 2.189297124600639, + "grad_norm": 0.184672549366951, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2741 + }, + { + "epoch": 2.1900958466453675, + "grad_norm": 0.21975156664848328, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2742 + }, + { + "epoch": 2.190894568690096, + "grad_norm": 0.08795829117298126, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2743 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.1045440062880516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2744 + }, + { + "epoch": 2.1924920127795526, + "grad_norm": 0.21037985384464264, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2745 + }, + { + "epoch": 2.193290734824281, + "grad_norm": 0.17791713774204254, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2746 + }, + { + "epoch": 2.1940894568690097, + "grad_norm": 0.06028178334236145, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2747 + }, + { + "epoch": 2.194888178913738, + "grad_norm": 0.0801217257976532, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2748 + }, + { + "epoch": 2.1956869009584663, + "grad_norm": 0.11564524471759796, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2749 + }, + { + "epoch": 2.196485623003195, + "grad_norm": 0.0652003139257431, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2750 + }, + { + "epoch": 2.1972843450479234, + "grad_norm": 0.057818979024887085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2751 + }, + { + "epoch": 2.198083067092652, + "grad_norm": 0.10466332733631134, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2752 + }, + { + "epoch": 2.1988817891373804, + "grad_norm": 0.09350129216909409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2753 + }, + { + "epoch": 2.1996805111821085, + "grad_norm": 0.04295926168560982, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2754 + }, + { + "epoch": 2.200479233226837, + "grad_norm": 0.0851534903049469, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2755 + }, + { + "epoch": 2.2012779552715656, + "grad_norm": 0.1857217401266098, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2756 + }, + { + "epoch": 2.202076677316294, + "grad_norm": 0.18267984688282013, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2757 + }, + { + "epoch": 2.202875399361022, + "grad_norm": 0.07249841094017029, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2758 + }, + { + "epoch": 2.2036741214057507, + "grad_norm": 0.14335495233535767, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2759 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.24338914453983307, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2760 + }, + { + "epoch": 2.2052715654952078, + "grad_norm": 0.17772778868675232, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2761 + }, + { + "epoch": 2.2060702875399363, + "grad_norm": 0.04809113219380379, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2762 + }, + { + "epoch": 2.2068690095846644, + "grad_norm": 0.09682228416204453, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2763 + }, + { + "epoch": 2.207667731629393, + "grad_norm": 0.13868102431297302, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2764 + }, + { + "epoch": 2.2084664536741214, + "grad_norm": 0.10956277698278427, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2765 + }, + { + "epoch": 2.20926517571885, + "grad_norm": 0.06163526698946953, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2766 + }, + { + "epoch": 2.2100638977635785, + "grad_norm": 0.14519700407981873, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2767 + }, + { + "epoch": 2.2108626198083066, + "grad_norm": 0.12486071139574051, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2768 + }, + { + "epoch": 2.211661341853035, + "grad_norm": 0.0414549857378006, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2769 + }, + { + "epoch": 2.2124600638977636, + "grad_norm": 0.13828913867473602, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2770 + }, + { + "epoch": 2.213258785942492, + "grad_norm": 0.18277914822101593, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2771 + }, + { + "epoch": 2.2140575079872207, + "grad_norm": 0.15727964043617249, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2772 + }, + { + "epoch": 2.2148562300319488, + "grad_norm": 0.07437993586063385, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2773 + }, + { + "epoch": 2.2156549520766773, + "grad_norm": 0.08192550390958786, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2774 + }, + { + "epoch": 2.216453674121406, + "grad_norm": 0.1804617941379547, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2775 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.18431466817855835, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2776 + }, + { + "epoch": 2.2180511182108624, + "grad_norm": 0.11281057447195053, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2777 + }, + { + "epoch": 2.218849840255591, + "grad_norm": 0.0398496650159359, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2778 + }, + { + "epoch": 2.2196485623003195, + "grad_norm": 0.16930198669433594, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2779 + }, + { + "epoch": 2.220447284345048, + "grad_norm": 0.2384660542011261, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2780 + }, + { + "epoch": 2.2212460063897765, + "grad_norm": 0.18867406249046326, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2781 + }, + { + "epoch": 2.2220447284345046, + "grad_norm": 0.041189488023519516, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2782 + }, + { + "epoch": 2.222843450479233, + "grad_norm": 0.21946212649345398, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2783 + }, + { + "epoch": 2.2236421725239617, + "grad_norm": 0.3394725024700165, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2784 + }, + { + "epoch": 2.22444089456869, + "grad_norm": 0.09503358602523804, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2785 + }, + { + "epoch": 2.2252396166134187, + "grad_norm": 0.180524080991745, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2786 + }, + { + "epoch": 2.226038338658147, + "grad_norm": 0.2961865961551666, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2787 + }, + { + "epoch": 2.2268370607028753, + "grad_norm": 0.25913500785827637, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2788 + }, + { + "epoch": 2.227635782747604, + "grad_norm": 0.08123381435871124, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2789 + }, + { + "epoch": 2.2284345047923324, + "grad_norm": 0.18587692081928253, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2790 + }, + { + "epoch": 2.229233226837061, + "grad_norm": 0.29838815331459045, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2791 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.2115599811077118, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2792 + }, + { + "epoch": 2.2308306709265175, + "grad_norm": 0.04708286374807358, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2793 + }, + { + "epoch": 2.231629392971246, + "grad_norm": 0.224795401096344, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2794 + }, + { + "epoch": 2.2324281150159746, + "grad_norm": 0.2673366665840149, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2795 + }, + { + "epoch": 2.2332268370607027, + "grad_norm": 0.1223720833659172, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2796 + }, + { + "epoch": 2.234025559105431, + "grad_norm": 0.12798862159252167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2797 + }, + { + "epoch": 2.2348242811501597, + "grad_norm": 0.25721317529678345, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2798 + }, + { + "epoch": 2.2356230031948883, + "grad_norm": 0.16970157623291016, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2799 + }, + { + "epoch": 2.236421725239617, + "grad_norm": 0.1311950534582138, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 2800 + }, + { + "epoch": 2.237220447284345, + "grad_norm": 0.32154732942581177, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2801 + }, + { + "epoch": 2.2380191693290734, + "grad_norm": 0.23601645231246948, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2802 + }, + { + "epoch": 2.238817891373802, + "grad_norm": 0.08307314664125443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2803 + }, + { + "epoch": 2.2396166134185305, + "grad_norm": 0.31183329224586487, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 2804 + }, + { + "epoch": 2.2404153354632586, + "grad_norm": 0.27391767501831055, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2805 + }, + { + "epoch": 2.241214057507987, + "grad_norm": 0.07247646898031235, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2806 + }, + { + "epoch": 2.2420127795527156, + "grad_norm": 0.1882690042257309, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2807 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.18179158866405487, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2808 + }, + { + "epoch": 2.2436102236421727, + "grad_norm": 0.10761548578739166, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2809 + }, + { + "epoch": 2.244408945686901, + "grad_norm": 0.3067700266838074, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2810 + }, + { + "epoch": 2.2452076677316293, + "grad_norm": 0.17450691759586334, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 2811 + }, + { + "epoch": 2.246006389776358, + "grad_norm": 0.14480780065059662, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2812 + }, + { + "epoch": 2.2468051118210863, + "grad_norm": 0.3325321078300476, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2813 + }, + { + "epoch": 2.247603833865815, + "grad_norm": 0.26238250732421875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2814 + }, + { + "epoch": 2.248402555910543, + "grad_norm": 0.07829522341489792, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2815 + }, + { + "epoch": 2.2492012779552715, + "grad_norm": 0.269721657037735, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2816 + }, + { + "epoch": 2.25, + "grad_norm": 0.16362956166267395, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2817 + }, + { + "epoch": 2.2507987220447285, + "grad_norm": 0.08129733055830002, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2818 + }, + { + "epoch": 2.251597444089457, + "grad_norm": 0.18430721759796143, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2819 + }, + { + "epoch": 2.252396166134185, + "grad_norm": 0.09634844213724136, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2820 + }, + { + "epoch": 2.2531948881789137, + "grad_norm": 0.08204549551010132, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2821 + }, + { + "epoch": 2.253993610223642, + "grad_norm": 0.1140882819890976, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2822 + }, + { + "epoch": 2.2547923322683707, + "grad_norm": 0.05056345462799072, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2823 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.06505320966243744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2824 + }, + { + "epoch": 2.2563897763578273, + "grad_norm": 0.11316727101802826, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2825 + }, + { + "epoch": 2.257188498402556, + "grad_norm": 0.1036633774638176, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2826 + }, + { + "epoch": 2.2579872204472844, + "grad_norm": 0.0470670685172081, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 2827 + }, + { + "epoch": 2.258785942492013, + "grad_norm": 0.0880327895283699, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2828 + }, + { + "epoch": 2.2595846645367414, + "grad_norm": 0.07664912939071655, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2829 + }, + { + "epoch": 2.2603833865814695, + "grad_norm": 0.049471575766801834, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2830 + }, + { + "epoch": 2.261182108626198, + "grad_norm": 0.04288775101304054, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2831 + }, + { + "epoch": 2.2619808306709266, + "grad_norm": 0.10124537348747253, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2832 + }, + { + "epoch": 2.262779552715655, + "grad_norm": 0.13865061104297638, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2833 + }, + { + "epoch": 2.263578274760383, + "grad_norm": 0.10227467864751816, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2834 + }, + { + "epoch": 2.2643769968051117, + "grad_norm": 0.050575822591781616, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 2835 + }, + { + "epoch": 2.2651757188498403, + "grad_norm": 0.044946715235710144, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2836 + }, + { + "epoch": 2.265974440894569, + "grad_norm": 0.0712895616889, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2837 + }, + { + "epoch": 2.2667731629392973, + "grad_norm": 0.07044374942779541, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2838 + }, + { + "epoch": 2.2675718849840254, + "grad_norm": 0.04518461972475052, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2839 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 0.05259617418050766, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2840 + }, + { + "epoch": 2.2691693290734825, + "grad_norm": 0.0654863640666008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2841 + }, + { + "epoch": 2.269968051118211, + "grad_norm": 0.04345248267054558, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2842 + }, + { + "epoch": 2.270766773162939, + "grad_norm": 0.057224296033382416, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2843 + }, + { + "epoch": 2.2715654952076676, + "grad_norm": 0.11091717332601547, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2844 + }, + { + "epoch": 2.272364217252396, + "grad_norm": 0.11426062136888504, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2845 + }, + { + "epoch": 2.2731629392971247, + "grad_norm": 0.10064966231584549, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2846 + }, + { + "epoch": 2.273961661341853, + "grad_norm": 0.13716623187065125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2847 + }, + { + "epoch": 2.2747603833865817, + "grad_norm": 0.09014318138360977, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2848 + }, + { + "epoch": 2.27555910543131, + "grad_norm": 0.16652478277683258, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2849 + }, + { + "epoch": 2.2763578274760383, + "grad_norm": 0.14217601716518402, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2850 + }, + { + "epoch": 2.277156549520767, + "grad_norm": 0.03895508497953415, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2851 + }, + { + "epoch": 2.2779552715654954, + "grad_norm": 0.17713558673858643, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2852 + }, + { + "epoch": 2.2787539936102235, + "grad_norm": 0.32960572838783264, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 2853 + }, + { + "epoch": 2.279552715654952, + "grad_norm": 0.2481910139322281, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2854 + }, + { + "epoch": 2.2803514376996805, + "grad_norm": 0.06643390655517578, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2855 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.17466357350349426, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2856 + }, + { + "epoch": 2.2819488817891376, + "grad_norm": 0.27781131863594055, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2857 + }, + { + "epoch": 2.2827476038338657, + "grad_norm": 0.19475431740283966, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2858 + }, + { + "epoch": 2.283546325878594, + "grad_norm": 0.07700221985578537, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 2859 + }, + { + "epoch": 2.2843450479233227, + "grad_norm": 0.22520926594734192, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2860 + }, + { + "epoch": 2.2851437699680512, + "grad_norm": 0.18735183775424957, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2861 + }, + { + "epoch": 2.2859424920127793, + "grad_norm": 0.04133198782801628, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2862 + }, + { + "epoch": 2.286741214057508, + "grad_norm": 0.2526150941848755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2863 + }, + { + "epoch": 2.2875399361022364, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2864 + }, + { + "epoch": 2.288338658146965, + "grad_norm": 0.12839898467063904, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2865 + }, + { + "epoch": 2.2891373801916934, + "grad_norm": 0.1259411871433258, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2866 + }, + { + "epoch": 2.289936102236422, + "grad_norm": 0.25480905175209045, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2867 + }, + { + "epoch": 2.29073482428115, + "grad_norm": 0.15650653839111328, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2868 + }, + { + "epoch": 2.2915335463258786, + "grad_norm": 0.07474946230649948, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2869 + }, + { + "epoch": 2.292332268370607, + "grad_norm": 0.170192688703537, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2870 + }, + { + "epoch": 2.2931309904153356, + "grad_norm": 0.13292376697063446, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2871 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 0.045553866773843765, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2872 + }, + { + "epoch": 2.2947284345047922, + "grad_norm": 0.10853269696235657, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2873 + }, + { + "epoch": 2.2955271565495208, + "grad_norm": 0.09945288300514221, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2874 + }, + { + "epoch": 2.2963258785942493, + "grad_norm": 0.039073117077350616, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2875 + }, + { + "epoch": 2.297124600638978, + "grad_norm": 0.05867530405521393, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2876 + }, + { + "epoch": 2.297923322683706, + "grad_norm": 0.07227179408073425, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2877 + }, + { + "epoch": 2.2987220447284344, + "grad_norm": 0.04456201195716858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2878 + }, + { + "epoch": 2.299520766773163, + "grad_norm": 0.11672481894493103, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2879 + }, + { + "epoch": 2.3003194888178915, + "grad_norm": 0.12335679680109024, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2880 + }, + { + "epoch": 2.3011182108626196, + "grad_norm": 0.043409012258052826, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2881 + }, + { + "epoch": 2.301916932907348, + "grad_norm": 0.09896806627511978, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2882 + }, + { + "epoch": 2.3027156549520766, + "grad_norm": 0.2037963569164276, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2883 + }, + { + "epoch": 2.303514376996805, + "grad_norm": 0.21378903090953827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2884 + }, + { + "epoch": 2.3043130990415337, + "grad_norm": 0.062362927943468094, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2885 + }, + { + "epoch": 2.3051118210862622, + "grad_norm": 0.17370136082172394, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2886 + }, + { + "epoch": 2.3059105431309903, + "grad_norm": 0.23190435767173767, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2887 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.08148342370986938, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2888 + }, + { + "epoch": 2.3075079872204474, + "grad_norm": 0.1596807837486267, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2889 + }, + { + "epoch": 2.308306709265176, + "grad_norm": 0.26396819949150085, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2890 + }, + { + "epoch": 2.309105431309904, + "grad_norm": 0.1509561687707901, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2891 + }, + { + "epoch": 2.3099041533546325, + "grad_norm": 0.09147104620933533, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2892 + }, + { + "epoch": 2.310702875399361, + "grad_norm": 0.23575374484062195, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2893 + }, + { + "epoch": 2.3115015974440896, + "grad_norm": 0.18403767049312592, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2894 + }, + { + "epoch": 2.312300319488818, + "grad_norm": 0.052600763738155365, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2895 + }, + { + "epoch": 2.313099041533546, + "grad_norm": 0.18707415461540222, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2896 + }, + { + "epoch": 2.3138977635782747, + "grad_norm": 0.20824143290519714, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2897 + }, + { + "epoch": 2.3146964856230032, + "grad_norm": 0.0775759220123291, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2898 + }, + { + "epoch": 2.3154952076677318, + "grad_norm": 0.10904766619205475, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2899 + }, + { + "epoch": 2.31629392971246, + "grad_norm": 0.1562514752149582, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2900 + }, + { + "epoch": 2.3170926517571884, + "grad_norm": 0.06689859926700592, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2901 + }, + { + "epoch": 2.317891373801917, + "grad_norm": 0.0887206643819809, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2902 + }, + { + "epoch": 2.3186900958466454, + "grad_norm": 0.13615944981575012, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2903 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.08094146102666855, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2904 + }, + { + "epoch": 2.3202875399361025, + "grad_norm": 0.06734368950128555, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2905 + }, + { + "epoch": 2.3210862619808306, + "grad_norm": 0.17405667901039124, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2906 + }, + { + "epoch": 2.321884984025559, + "grad_norm": 0.23022079467773438, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2907 + }, + { + "epoch": 2.3226837060702876, + "grad_norm": 0.17341896891593933, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2908 + }, + { + "epoch": 2.323482428115016, + "grad_norm": 0.037751875817775726, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2909 + }, + { + "epoch": 2.3242811501597442, + "grad_norm": 0.12434598803520203, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2910 + }, + { + "epoch": 2.3250798722044728, + "grad_norm": 0.11344511806964874, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2911 + }, + { + "epoch": 2.3258785942492013, + "grad_norm": 0.05426390469074249, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2912 + }, + { + "epoch": 2.32667731629393, + "grad_norm": 0.11261611431837082, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2913 + }, + { + "epoch": 2.3274760383386583, + "grad_norm": 0.22023531794548035, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2914 + }, + { + "epoch": 2.3282747603833864, + "grad_norm": 0.2050291895866394, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2915 + }, + { + "epoch": 2.329073482428115, + "grad_norm": 0.05478905141353607, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2916 + }, + { + "epoch": 2.3298722044728435, + "grad_norm": 0.15363283455371857, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2917 + }, + { + "epoch": 2.330670926517572, + "grad_norm": 0.17348943650722504, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2918 + }, + { + "epoch": 2.3314696485623, + "grad_norm": 0.05366649851202965, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2919 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 0.16219462454319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2920 + }, + { + "epoch": 2.333067092651757, + "grad_norm": 0.23911446332931519, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2921 + }, + { + "epoch": 2.3338658146964857, + "grad_norm": 0.12384039163589478, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2922 + }, + { + "epoch": 2.334664536741214, + "grad_norm": 0.08747945725917816, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 2923 + }, + { + "epoch": 2.3354632587859427, + "grad_norm": 0.19737359881401062, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2924 + }, + { + "epoch": 2.336261980830671, + "grad_norm": 0.11312227696180344, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2925 + }, + { + "epoch": 2.3370607028753994, + "grad_norm": 0.09944877028465271, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2926 + }, + { + "epoch": 2.337859424920128, + "grad_norm": 0.23282872140407562, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2927 + }, + { + "epoch": 2.3386581469648564, + "grad_norm": 0.14369411766529083, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2928 + }, + { + "epoch": 2.3394568690095845, + "grad_norm": 0.07267388701438904, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2929 + }, + { + "epoch": 2.340255591054313, + "grad_norm": 0.18751965463161469, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2930 + }, + { + "epoch": 2.3410543130990416, + "grad_norm": 0.20886634290218353, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2931 + }, + { + "epoch": 2.34185303514377, + "grad_norm": 0.11675436794757843, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2932 + }, + { + "epoch": 2.3426517571884986, + "grad_norm": 0.08915580064058304, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2933 + }, + { + "epoch": 2.3434504792332267, + "grad_norm": 0.1534406840801239, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2934 + }, + { + "epoch": 2.344249201277955, + "grad_norm": 0.08791724592447281, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2935 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 0.04647858813405037, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2936 + }, + { + "epoch": 2.3458466453674123, + "grad_norm": 0.09236840158700943, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2937 + }, + { + "epoch": 2.3466453674121404, + "grad_norm": 0.09079006314277649, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2938 + }, + { + "epoch": 2.347444089456869, + "grad_norm": 0.03492455556988716, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2939 + }, + { + "epoch": 2.3482428115015974, + "grad_norm": 0.11871617287397385, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2940 + }, + { + "epoch": 2.349041533546326, + "grad_norm": 0.10904752463102341, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2941 + }, + { + "epoch": 2.3498402555910545, + "grad_norm": 0.05331781879067421, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2942 + }, + { + "epoch": 2.3506389776357826, + "grad_norm": 0.1213313564658165, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2943 + }, + { + "epoch": 2.351437699680511, + "grad_norm": 0.12995922565460205, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 2944 + }, + { + "epoch": 2.3522364217252396, + "grad_norm": 0.05770767107605934, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2945 + }, + { + "epoch": 2.353035143769968, + "grad_norm": 0.09310754388570786, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 2946 + }, + { + "epoch": 2.3538338658146967, + "grad_norm": 0.17539645731449127, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2947 + }, + { + "epoch": 2.3546325878594248, + "grad_norm": 0.14126333594322205, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2948 + }, + { + "epoch": 2.3554313099041533, + "grad_norm": 0.04220091179013252, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2949 + }, + { + "epoch": 2.356230031948882, + "grad_norm": 0.14341594278812408, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2950 + }, + { + "epoch": 2.3570287539936103, + "grad_norm": 0.13884525001049042, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2951 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 0.040859755128622055, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2952 + }, + { + "epoch": 2.358626198083067, + "grad_norm": 0.14475658535957336, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 2953 + }, + { + "epoch": 2.3594249201277955, + "grad_norm": 0.18962377309799194, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2954 + }, + { + "epoch": 2.360223642172524, + "grad_norm": 0.0909075066447258, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2955 + }, + { + "epoch": 2.3610223642172525, + "grad_norm": 0.08225106447935104, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2956 + }, + { + "epoch": 2.3618210862619806, + "grad_norm": 0.1564486026763916, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2957 + }, + { + "epoch": 2.362619808306709, + "grad_norm": 0.08859751373529434, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2958 + }, + { + "epoch": 2.3634185303514377, + "grad_norm": 0.10907880961894989, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2959 + }, + { + "epoch": 2.364217252396166, + "grad_norm": 0.2368745654821396, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2960 + }, + { + "epoch": 2.3650159744408947, + "grad_norm": 0.15427371859550476, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2961 + }, + { + "epoch": 2.365814696485623, + "grad_norm": 0.07661470025777817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2962 + }, + { + "epoch": 2.3666134185303513, + "grad_norm": 0.2368732988834381, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2963 + }, + { + "epoch": 2.36741214057508, + "grad_norm": 0.24830125272274017, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2964 + }, + { + "epoch": 2.3682108626198084, + "grad_norm": 0.06940490007400513, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2965 + }, + { + "epoch": 2.369009584664537, + "grad_norm": 0.18672171235084534, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 2966 + }, + { + "epoch": 2.369808306709265, + "grad_norm": 0.22521120309829712, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2967 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 0.0496690534055233, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2968 + }, + { + "epoch": 2.371405750798722, + "grad_norm": 0.16735650599002838, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2969 + }, + { + "epoch": 2.3722044728434506, + "grad_norm": 0.18583746254444122, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2970 + }, + { + "epoch": 2.373003194888179, + "grad_norm": 0.03828646242618561, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2971 + }, + { + "epoch": 2.373801916932907, + "grad_norm": 0.14302043616771698, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2972 + }, + { + "epoch": 2.3746006389776357, + "grad_norm": 0.14217248558998108, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2973 + }, + { + "epoch": 2.3753993610223643, + "grad_norm": 0.08656741678714752, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2974 + }, + { + "epoch": 2.376198083067093, + "grad_norm": 0.18724001944065094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2975 + }, + { + "epoch": 2.376996805111821, + "grad_norm": 0.21609556674957275, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2976 + }, + { + "epoch": 2.3777955271565494, + "grad_norm": 0.08098721504211426, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2977 + }, + { + "epoch": 2.378594249201278, + "grad_norm": 0.09842941910028458, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2978 + }, + { + "epoch": 2.3793929712460065, + "grad_norm": 0.14060764014720917, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2979 + }, + { + "epoch": 2.380191693290735, + "grad_norm": 0.063141830265522, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2980 + }, + { + "epoch": 2.380990415335463, + "grad_norm": 0.10411619395017624, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2981 + }, + { + "epoch": 2.3817891373801916, + "grad_norm": 0.15445855259895325, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2982 + }, + { + "epoch": 2.38258785942492, + "grad_norm": 0.07754000276327133, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2983 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 0.05312122777104378, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2984 + }, + { + "epoch": 2.384185303514377, + "grad_norm": 0.09916596859693527, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2985 + }, + { + "epoch": 2.3849840255591053, + "grad_norm": 0.12749150395393372, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2986 + }, + { + "epoch": 2.385782747603834, + "grad_norm": 0.054589178413152695, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2987 + }, + { + "epoch": 2.3865814696485623, + "grad_norm": 0.08480732887983322, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2988 + }, + { + "epoch": 2.387380191693291, + "grad_norm": 0.13158805668354034, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 2989 + }, + { + "epoch": 2.3881789137380194, + "grad_norm": 0.11916540563106537, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2990 + }, + { + "epoch": 2.3889776357827475, + "grad_norm": 0.05829031020402908, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2991 + }, + { + "epoch": 2.389776357827476, + "grad_norm": 0.18292354047298431, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2992 + }, + { + "epoch": 2.3905750798722045, + "grad_norm": 0.18494512140750885, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2993 + }, + { + "epoch": 2.391373801916933, + "grad_norm": 0.06371760368347168, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2994 + }, + { + "epoch": 2.392172523961661, + "grad_norm": 0.10157672315835953, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2995 + }, + { + "epoch": 2.3929712460063897, + "grad_norm": 0.13981172442436218, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2996 + }, + { + "epoch": 2.393769968051118, + "grad_norm": 0.07794835418462753, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2997 + }, + { + "epoch": 2.3945686900958467, + "grad_norm": 0.038293492048978806, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2998 + }, + { + "epoch": 2.3953674121405752, + "grad_norm": 0.06315408647060394, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2999 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 0.045907966792583466, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3000 + }, + { + "epoch": 2.396964856230032, + "grad_norm": 0.038717497140169144, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3001 + }, + { + "epoch": 2.3977635782747604, + "grad_norm": 0.0376095287501812, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3002 + }, + { + "epoch": 2.398562300319489, + "grad_norm": 0.05739009007811546, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3003 + }, + { + "epoch": 2.3993610223642174, + "grad_norm": 0.034832656383514404, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3004 + }, + { + "epoch": 2.4001597444089455, + "grad_norm": 0.06432276219129562, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3005 + }, + { + "epoch": 2.400958466453674, + "grad_norm": 0.05443817004561424, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3006 + }, + { + "epoch": 2.4017571884984026, + "grad_norm": 0.04691087454557419, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3007 + }, + { + "epoch": 2.402555910543131, + "grad_norm": 0.04394471272826195, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3008 + }, + { + "epoch": 2.4033546325878596, + "grad_norm": 0.03642019256949425, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3009 + }, + { + "epoch": 2.4041533546325877, + "grad_norm": 0.05891808122396469, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3010 + }, + { + "epoch": 2.4049520766773163, + "grad_norm": 0.04530616104602814, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3011 + }, + { + "epoch": 2.405750798722045, + "grad_norm": 0.0518258772790432, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3012 + }, + { + "epoch": 2.4065495207667733, + "grad_norm": 0.11279664188623428, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3013 + }, + { + "epoch": 2.4073482428115014, + "grad_norm": 0.10047753900289536, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3014 + }, + { + "epoch": 2.40814696485623, + "grad_norm": 0.06645897775888443, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3015 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 0.03372915834188461, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3016 + }, + { + "epoch": 2.409744408945687, + "grad_norm": 0.05353475734591484, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3017 + }, + { + "epoch": 2.4105431309904155, + "grad_norm": 0.038493942469358444, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3018 + }, + { + "epoch": 2.4113418530351436, + "grad_norm": 0.07303082197904587, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3019 + }, + { + "epoch": 2.412140575079872, + "grad_norm": 0.043219298124313354, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3020 + }, + { + "epoch": 2.4129392971246006, + "grad_norm": 0.05016458407044411, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3021 + }, + { + "epoch": 2.413738019169329, + "grad_norm": 0.08490880578756332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3022 + }, + { + "epoch": 2.4145367412140573, + "grad_norm": 0.07245411723852158, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3023 + }, + { + "epoch": 2.415335463258786, + "grad_norm": 0.052343063056468964, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3024 + }, + { + "epoch": 2.4161341853035143, + "grad_norm": 0.13449524343013763, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3025 + }, + { + "epoch": 2.416932907348243, + "grad_norm": 0.13177144527435303, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3026 + }, + { + "epoch": 2.4177316293929714, + "grad_norm": 0.06579594314098358, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 3027 + }, + { + "epoch": 2.4185303514377, + "grad_norm": 0.12716646492481232, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3028 + }, + { + "epoch": 2.419329073482428, + "grad_norm": 0.20006005465984344, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3029 + }, + { + "epoch": 2.4201277955271565, + "grad_norm": 0.16598355770111084, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3030 + }, + { + "epoch": 2.420926517571885, + "grad_norm": 0.06625109165906906, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3031 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.10521841049194336, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3032 + }, + { + "epoch": 2.4225239616613417, + "grad_norm": 0.14134426414966583, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3033 + }, + { + "epoch": 2.42332268370607, + "grad_norm": 0.056669678539037704, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3034 + }, + { + "epoch": 2.4241214057507987, + "grad_norm": 0.052738044410943985, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3035 + }, + { + "epoch": 2.4249201277955272, + "grad_norm": 0.06623729318380356, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3036 + }, + { + "epoch": 2.4257188498402558, + "grad_norm": 0.04038512706756592, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3037 + }, + { + "epoch": 2.426517571884984, + "grad_norm": 0.057600609958171844, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3038 + }, + { + "epoch": 2.4273162939297124, + "grad_norm": 0.08174199610948563, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3039 + }, + { + "epoch": 2.428115015974441, + "grad_norm": 0.07850457727909088, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3040 + }, + { + "epoch": 2.4289137380191694, + "grad_norm": 0.04368523135781288, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3041 + }, + { + "epoch": 2.4297124600638975, + "grad_norm": 0.11637478321790695, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3042 + }, + { + "epoch": 2.430511182108626, + "grad_norm": 0.09765078872442245, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3043 + }, + { + "epoch": 2.4313099041533546, + "grad_norm": 0.04842933267354965, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3044 + }, + { + "epoch": 2.432108626198083, + "grad_norm": 0.08858928829431534, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3045 + }, + { + "epoch": 2.4329073482428116, + "grad_norm": 0.12645326554775238, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3046 + }, + { + "epoch": 2.43370607028754, + "grad_norm": 0.09839878976345062, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 3047 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 0.04484904557466507, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3048 + }, + { + "epoch": 2.4353035143769968, + "grad_norm": 0.13912586867809296, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3049 + }, + { + "epoch": 2.4361022364217253, + "grad_norm": 0.18569444119930267, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3050 + }, + { + "epoch": 2.436900958466454, + "grad_norm": 0.13544169068336487, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3051 + }, + { + "epoch": 2.437699680511182, + "grad_norm": 0.04663483425974846, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3052 + }, + { + "epoch": 2.4384984025559104, + "grad_norm": 0.11609578132629395, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3053 + }, + { + "epoch": 2.439297124600639, + "grad_norm": 0.17497499287128448, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3054 + }, + { + "epoch": 2.4400958466453675, + "grad_norm": 0.19216352701187134, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3055 + }, + { + "epoch": 2.440894568690096, + "grad_norm": 0.11638841032981873, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3056 + }, + { + "epoch": 2.441693290734824, + "grad_norm": 0.05816149711608887, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3057 + }, + { + "epoch": 2.4424920127795526, + "grad_norm": 0.1650087982416153, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3058 + }, + { + "epoch": 2.443290734824281, + "grad_norm": 0.2105383425951004, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3059 + }, + { + "epoch": 2.4440894568690097, + "grad_norm": 0.133597731590271, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3060 + }, + { + "epoch": 2.4448881789137378, + "grad_norm": 0.03882076218724251, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3061 + }, + { + "epoch": 2.4456869009584663, + "grad_norm": 0.08914566785097122, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3062 + }, + { + "epoch": 2.446485623003195, + "grad_norm": 0.08115291595458984, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3063 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 0.0402134470641613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3064 + }, + { + "epoch": 2.448083067092652, + "grad_norm": 0.12838906049728394, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3065 + }, + { + "epoch": 2.4488817891373804, + "grad_norm": 0.1865018606185913, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3066 + }, + { + "epoch": 2.4496805111821085, + "grad_norm": 0.13134929537773132, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3067 + }, + { + "epoch": 2.450479233226837, + "grad_norm": 0.05415928363800049, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3068 + }, + { + "epoch": 2.4512779552715656, + "grad_norm": 0.0739838033914566, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3069 + }, + { + "epoch": 2.452076677316294, + "grad_norm": 0.07965957373380661, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 3070 + }, + { + "epoch": 2.452875399361022, + "grad_norm": 0.0416380800306797, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3071 + }, + { + "epoch": 2.4536741214057507, + "grad_norm": 0.03494519367814064, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3072 + }, + { + "epoch": 2.4544728434504792, + "grad_norm": 0.050772733986377716, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3073 + }, + { + "epoch": 2.4552715654952078, + "grad_norm": 0.03939373791217804, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3074 + }, + { + "epoch": 2.4560702875399363, + "grad_norm": 0.11769624799489975, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3075 + }, + { + "epoch": 2.4568690095846644, + "grad_norm": 0.33884114027023315, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3076 + }, + { + "epoch": 2.457667731629393, + "grad_norm": 0.07171089947223663, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3077 + }, + { + "epoch": 2.4584664536741214, + "grad_norm": 0.0707232877612114, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3078 + }, + { + "epoch": 2.45926517571885, + "grad_norm": 0.14245279133319855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3079 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.12356095761060715, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3080 + }, + { + "epoch": 2.4608626198083066, + "grad_norm": 0.0694037601351738, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3081 + }, + { + "epoch": 2.461661341853035, + "grad_norm": 0.0511220321059227, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3082 + }, + { + "epoch": 2.4624600638977636, + "grad_norm": 0.10915348678827286, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3083 + }, + { + "epoch": 2.463258785942492, + "grad_norm": 0.10797106474637985, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3084 + }, + { + "epoch": 2.4640575079872207, + "grad_norm": 0.05721200630068779, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3085 + }, + { + "epoch": 2.4648562300319488, + "grad_norm": 0.04477681592106819, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3086 + }, + { + "epoch": 2.4656549520766773, + "grad_norm": 0.08826448023319244, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3087 + }, + { + "epoch": 2.466453674121406, + "grad_norm": 0.1024692952632904, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3088 + }, + { + "epoch": 2.4672523961661343, + "grad_norm": 0.06543146073818207, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3089 + }, + { + "epoch": 2.4680511182108624, + "grad_norm": 0.06146182119846344, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3090 + }, + { + "epoch": 2.468849840255591, + "grad_norm": 0.12857408821582794, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3091 + }, + { + "epoch": 2.4696485623003195, + "grad_norm": 0.12273124605417252, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3092 + }, + { + "epoch": 2.470447284345048, + "grad_norm": 0.06467662751674652, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3093 + }, + { + "epoch": 2.4712460063897765, + "grad_norm": 0.07181179523468018, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3094 + }, + { + "epoch": 2.4720447284345046, + "grad_norm": 0.20223456621170044, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3095 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 0.25061357021331787, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3096 + }, + { + "epoch": 2.4736421725239617, + "grad_norm": 0.16317492723464966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3097 + }, + { + "epoch": 2.47444089456869, + "grad_norm": 0.04005994647741318, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3098 + }, + { + "epoch": 2.4752396166134183, + "grad_norm": 0.15954583883285522, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3099 + }, + { + "epoch": 2.476038338658147, + "grad_norm": 0.2088920623064041, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3100 + }, + { + "epoch": 2.4768370607028753, + "grad_norm": 0.11643055826425552, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3101 + }, + { + "epoch": 2.477635782747604, + "grad_norm": 0.11083687841892242, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3102 + }, + { + "epoch": 2.4784345047923324, + "grad_norm": 0.24777425825595856, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3103 + }, + { + "epoch": 2.479233226837061, + "grad_norm": 0.19513146579265594, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3104 + }, + { + "epoch": 2.480031948881789, + "grad_norm": 0.05009200796484947, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3105 + }, + { + "epoch": 2.4808306709265175, + "grad_norm": 0.2673046588897705, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3106 + }, + { + "epoch": 2.481629392971246, + "grad_norm": 0.3035629093647003, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3107 + }, + { + "epoch": 2.4824281150159746, + "grad_norm": 0.13213352859020233, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3108 + }, + { + "epoch": 2.4832268370607027, + "grad_norm": 0.13605083525180817, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3109 + }, + { + "epoch": 2.484025559105431, + "grad_norm": 0.2958623170852661, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3110 + }, + { + "epoch": 2.4848242811501597, + "grad_norm": 0.23080390691757202, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3111 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 0.046950701624155045, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3112 + }, + { + "epoch": 2.486421725239617, + "grad_norm": 0.24903765320777893, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3113 + }, + { + "epoch": 2.487220447284345, + "grad_norm": 0.233968585729599, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3114 + }, + { + "epoch": 2.4880191693290734, + "grad_norm": 0.04709520563483238, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3115 + }, + { + "epoch": 2.488817891373802, + "grad_norm": 0.16599629819393158, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3116 + }, + { + "epoch": 2.4896166134185305, + "grad_norm": 0.19273866713047028, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3117 + }, + { + "epoch": 2.4904153354632586, + "grad_norm": 0.11514598876237869, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3118 + }, + { + "epoch": 2.491214057507987, + "grad_norm": 0.08656881004571915, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 3119 + }, + { + "epoch": 2.4920127795527156, + "grad_norm": 0.18213899433612823, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3120 + }, + { + "epoch": 2.492811501597444, + "grad_norm": 0.11029175668954849, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3121 + }, + { + "epoch": 2.4936102236421727, + "grad_norm": 0.04480903223156929, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3122 + }, + { + "epoch": 2.494408945686901, + "grad_norm": 0.04919225722551346, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3123 + }, + { + "epoch": 2.4952076677316293, + "grad_norm": 0.06349056959152222, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3124 + }, + { + "epoch": 2.496006389776358, + "grad_norm": 0.04066464304924011, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3125 + }, + { + "epoch": 2.4968051118210863, + "grad_norm": 0.03992457687854767, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3126 + }, + { + "epoch": 2.497603833865815, + "grad_norm": 0.04580394923686981, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3127 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.13679265975952148, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3128 + }, + { + "epoch": 2.4992012779552715, + "grad_norm": 0.20708884298801422, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3129 + }, + { + "epoch": 2.5, + "grad_norm": 0.22991639375686646, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3130 + }, + { + "epoch": 2.5007987220447285, + "grad_norm": 0.15380895137786865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3131 + }, + { + "epoch": 2.501597444089457, + "grad_norm": 0.05112789571285248, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3132 + }, + { + "epoch": 2.502396166134185, + "grad_norm": 0.19797906279563904, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3133 + }, + { + "epoch": 2.5031948881789137, + "grad_norm": 0.18190141022205353, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3134 + }, + { + "epoch": 2.503993610223642, + "grad_norm": 0.04291468858718872, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3135 + }, + { + "epoch": 2.5047923322683707, + "grad_norm": 0.14576731622219086, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3136 + }, + { + "epoch": 2.505591054313099, + "grad_norm": 0.25093281269073486, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3137 + }, + { + "epoch": 2.5063897763578273, + "grad_norm": 0.22738556563854218, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3138 + }, + { + "epoch": 2.507188498402556, + "grad_norm": 0.08985915035009384, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3139 + }, + { + "epoch": 2.5079872204472844, + "grad_norm": 0.09632397443056107, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3140 + }, + { + "epoch": 2.508785942492013, + "grad_norm": 0.12138333916664124, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3141 + }, + { + "epoch": 2.5095846645367414, + "grad_norm": 0.04163306951522827, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3142 + }, + { + "epoch": 2.5103833865814695, + "grad_norm": 0.06187185272574425, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3143 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.09463546425104141, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3144 + }, + { + "epoch": 2.5119808306709266, + "grad_norm": 0.12386980652809143, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3145 + }, + { + "epoch": 2.512779552715655, + "grad_norm": 0.07090163975954056, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3146 + }, + { + "epoch": 2.513578274760383, + "grad_norm": 0.04502219334244728, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3147 + }, + { + "epoch": 2.5143769968051117, + "grad_norm": 0.08453603833913803, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3148 + }, + { + "epoch": 2.5151757188498403, + "grad_norm": 0.08686821907758713, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3149 + }, + { + "epoch": 2.515974440894569, + "grad_norm": 0.03968734294176102, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3150 + }, + { + "epoch": 2.5167731629392973, + "grad_norm": 0.08613990992307663, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3151 + }, + { + "epoch": 2.5175718849840254, + "grad_norm": 0.07950794696807861, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3152 + }, + { + "epoch": 2.518370607028754, + "grad_norm": 0.0449741929769516, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3153 + }, + { + "epoch": 2.5191693290734825, + "grad_norm": 0.09032034873962402, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3154 + }, + { + "epoch": 2.519968051118211, + "grad_norm": 0.06834430247545242, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3155 + }, + { + "epoch": 2.520766773162939, + "grad_norm": 0.13820379972457886, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3156 + }, + { + "epoch": 2.5215654952076676, + "grad_norm": 0.17753586173057556, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3157 + }, + { + "epoch": 2.522364217252396, + "grad_norm": 0.2663286626338959, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3158 + }, + { + "epoch": 2.5231629392971247, + "grad_norm": 0.21509577333927155, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3159 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.04614022746682167, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3160 + }, + { + "epoch": 2.5247603833865817, + "grad_norm": 0.13719527423381805, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3161 + }, + { + "epoch": 2.52555910543131, + "grad_norm": 0.20119087398052216, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3162 + }, + { + "epoch": 2.5263578274760383, + "grad_norm": 0.1822054237127304, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3163 + }, + { + "epoch": 2.527156549520767, + "grad_norm": 0.06550543755292892, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3164 + }, + { + "epoch": 2.527955271565495, + "grad_norm": 0.08079471439123154, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3165 + }, + { + "epoch": 2.5287539936102235, + "grad_norm": 0.10106988251209259, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3166 + }, + { + "epoch": 2.529552715654952, + "grad_norm": 0.06818784028291702, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3167 + }, + { + "epoch": 2.5303514376996805, + "grad_norm": 0.05976718291640282, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3168 + }, + { + "epoch": 2.531150159744409, + "grad_norm": 0.18163853883743286, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3169 + }, + { + "epoch": 2.5319488817891376, + "grad_norm": 0.26418858766555786, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3170 + }, + { + "epoch": 2.5327476038338657, + "grad_norm": 0.24044150114059448, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3171 + }, + { + "epoch": 2.533546325878594, + "grad_norm": 0.07499254494905472, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3172 + }, + { + "epoch": 2.5343450479233227, + "grad_norm": 0.17483314871788025, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3173 + }, + { + "epoch": 2.5351437699680512, + "grad_norm": 0.2698160707950592, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3174 + }, + { + "epoch": 2.5359424920127793, + "grad_norm": 0.2116270661354065, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3175 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.0545198880136013, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3176 + }, + { + "epoch": 2.5375399361022364, + "grad_norm": 0.1926649659872055, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3177 + }, + { + "epoch": 2.538338658146965, + "grad_norm": 0.24152790009975433, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3178 + }, + { + "epoch": 2.5391373801916934, + "grad_norm": 0.12380969524383545, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3179 + }, + { + "epoch": 2.539936102236422, + "grad_norm": 0.07934054732322693, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3180 + }, + { + "epoch": 2.54073482428115, + "grad_norm": 0.13688413798809052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3181 + }, + { + "epoch": 2.5415335463258786, + "grad_norm": 0.05832000821828842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3182 + }, + { + "epoch": 2.542332268370607, + "grad_norm": 0.08729993551969528, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3183 + }, + { + "epoch": 2.543130990415335, + "grad_norm": 0.16843630373477936, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3184 + }, + { + "epoch": 2.5439297124600637, + "grad_norm": 0.13045506179332733, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3185 + }, + { + "epoch": 2.5447284345047922, + "grad_norm": 0.038882140070199966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3186 + }, + { + "epoch": 2.5455271565495208, + "grad_norm": 0.14922545850276947, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3187 + }, + { + "epoch": 2.5463258785942493, + "grad_norm": 0.1961440145969391, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3188 + }, + { + "epoch": 2.547124600638978, + "grad_norm": 0.08585302531719208, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3189 + }, + { + "epoch": 2.547923322683706, + "grad_norm": 0.13141697645187378, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3190 + }, + { + "epoch": 2.5487220447284344, + "grad_norm": 0.20332233607769012, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3191 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.1740144044160843, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3192 + }, + { + "epoch": 2.5503194888178915, + "grad_norm": 0.04738207906484604, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3193 + }, + { + "epoch": 2.5511182108626196, + "grad_norm": 0.23204317688941956, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3194 + }, + { + "epoch": 2.551916932907348, + "grad_norm": 0.29033714532852173, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3195 + }, + { + "epoch": 2.5527156549520766, + "grad_norm": 0.1251334547996521, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3196 + }, + { + "epoch": 2.553514376996805, + "grad_norm": 0.1610727608203888, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3197 + }, + { + "epoch": 2.5543130990415337, + "grad_norm": 0.284105509519577, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3198 + }, + { + "epoch": 2.5551118210862622, + "grad_norm": 0.1530643254518509, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 3199 + }, + { + "epoch": 2.5559105431309903, + "grad_norm": 0.07761498540639877, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3200 + }, + { + "epoch": 2.556709265175719, + "grad_norm": 0.16693277657032013, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3201 + }, + { + "epoch": 2.5575079872204474, + "grad_norm": 0.06345608085393906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3202 + }, + { + "epoch": 2.5583067092651754, + "grad_norm": 0.10956210643053055, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3203 + }, + { + "epoch": 2.559105431309904, + "grad_norm": 0.17655007541179657, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3204 + }, + { + "epoch": 2.5599041533546325, + "grad_norm": 0.12615050375461578, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3205 + }, + { + "epoch": 2.560702875399361, + "grad_norm": 0.049671441316604614, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3206 + }, + { + "epoch": 2.5615015974440896, + "grad_norm": 0.16559815406799316, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3207 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 0.1279190182685852, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3208 + }, + { + "epoch": 2.563099041533546, + "grad_norm": 0.0540652722120285, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3209 + }, + { + "epoch": 2.5638977635782747, + "grad_norm": 0.1287074238061905, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 3210 + }, + { + "epoch": 2.5646964856230032, + "grad_norm": 0.1118067055940628, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3211 + }, + { + "epoch": 2.5654952076677318, + "grad_norm": 0.05159451439976692, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3212 + }, + { + "epoch": 2.56629392971246, + "grad_norm": 0.10654652118682861, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3213 + }, + { + "epoch": 2.5670926517571884, + "grad_norm": 0.15669982135295868, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3214 + }, + { + "epoch": 2.567891373801917, + "grad_norm": 0.11388157308101654, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3215 + }, + { + "epoch": 2.5686900958466454, + "grad_norm": 0.06434119492769241, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3216 + }, + { + "epoch": 2.569488817891374, + "grad_norm": 0.050070468336343765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3217 + }, + { + "epoch": 2.5702875399361025, + "grad_norm": 0.0522335022687912, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3218 + }, + { + "epoch": 2.5710862619808306, + "grad_norm": 0.04716494306921959, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3219 + }, + { + "epoch": 2.571884984025559, + "grad_norm": 0.03770711272954941, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3220 + }, + { + "epoch": 2.5726837060702876, + "grad_norm": 0.03955485299229622, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3221 + }, + { + "epoch": 2.5734824281150157, + "grad_norm": 0.03824841231107712, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3222 + }, + { + "epoch": 2.5742811501597442, + "grad_norm": 0.04722970351576805, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3223 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 0.05470758676528931, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3224 + }, + { + "epoch": 2.5758785942492013, + "grad_norm": 0.04934269189834595, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3225 + }, + { + "epoch": 2.57667731629393, + "grad_norm": 0.040627289563417435, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3226 + }, + { + "epoch": 2.5774760383386583, + "grad_norm": 0.05668056383728981, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3227 + }, + { + "epoch": 2.5782747603833864, + "grad_norm": 0.11724753677845001, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3228 + }, + { + "epoch": 2.579073482428115, + "grad_norm": 0.12204517424106598, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3229 + }, + { + "epoch": 2.5798722044728435, + "grad_norm": 0.10652083158493042, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3230 + }, + { + "epoch": 2.580670926517572, + "grad_norm": 0.07430299371480942, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3231 + }, + { + "epoch": 2.5814696485623, + "grad_norm": 0.03460770472884178, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3232 + }, + { + "epoch": 2.5822683706070286, + "grad_norm": 0.080150306224823, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3233 + }, + { + "epoch": 2.583067092651757, + "grad_norm": 0.1291198879480362, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3234 + }, + { + "epoch": 2.5838658146964857, + "grad_norm": 0.19541533291339874, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3235 + }, + { + "epoch": 2.584664536741214, + "grad_norm": 0.24089939892292023, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3236 + }, + { + "epoch": 2.5854632587859427, + "grad_norm": 0.1933099627494812, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3237 + }, + { + "epoch": 2.586261980830671, + "grad_norm": 0.07295489311218262, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3238 + }, + { + "epoch": 2.5870607028753994, + "grad_norm": 0.10686071962118149, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3239 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 0.17052637040615082, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3240 + }, + { + "epoch": 2.588658146964856, + "grad_norm": 0.12377535551786423, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3241 + }, + { + "epoch": 2.5894568690095845, + "grad_norm": 0.03730800375342369, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3242 + }, + { + "epoch": 2.590255591054313, + "grad_norm": 0.13848428428173065, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3243 + }, + { + "epoch": 2.5910543130990416, + "grad_norm": 0.18361017107963562, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3244 + }, + { + "epoch": 2.59185303514377, + "grad_norm": 0.11140795797109604, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3245 + }, + { + "epoch": 2.5926517571884986, + "grad_norm": 0.033891428261995316, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3246 + }, + { + "epoch": 2.5934504792332267, + "grad_norm": 0.13179628551006317, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3247 + }, + { + "epoch": 2.594249201277955, + "grad_norm": 0.19785374402999878, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3248 + }, + { + "epoch": 2.5950479233226837, + "grad_norm": 0.15991398692131042, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3249 + }, + { + "epoch": 2.5958466453674123, + "grad_norm": 0.0702645480632782, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3250 + }, + { + "epoch": 2.5966453674121404, + "grad_norm": 0.038220152258872986, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3251 + }, + { + "epoch": 2.597444089456869, + "grad_norm": 0.048042308539152145, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3252 + }, + { + "epoch": 2.5982428115015974, + "grad_norm": 0.05673132464289665, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3253 + }, + { + "epoch": 2.599041533546326, + "grad_norm": 0.057284750044345856, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3254 + }, + { + "epoch": 2.5998402555910545, + "grad_norm": 0.052904874086380005, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3255 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.04914860427379608, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3256 + }, + { + "epoch": 2.601437699680511, + "grad_norm": 0.08870472013950348, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3257 + }, + { + "epoch": 2.6022364217252396, + "grad_norm": 0.09863728284835815, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3258 + }, + { + "epoch": 2.603035143769968, + "grad_norm": 0.08116353303194046, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3259 + }, + { + "epoch": 2.6038338658146962, + "grad_norm": 0.043653007596731186, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3260 + }, + { + "epoch": 2.6046325878594248, + "grad_norm": 0.0579618401825428, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3261 + }, + { + "epoch": 2.6054313099041533, + "grad_norm": 0.08072935789823532, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3262 + }, + { + "epoch": 2.606230031948882, + "grad_norm": 0.05391686409711838, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3263 + }, + { + "epoch": 2.6070287539936103, + "grad_norm": 0.03471128270030022, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3264 + }, + { + "epoch": 2.607827476038339, + "grad_norm": 0.056328870356082916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3265 + }, + { + "epoch": 2.608626198083067, + "grad_norm": 0.05196002125740051, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3266 + }, + { + "epoch": 2.6094249201277955, + "grad_norm": 0.04338999465107918, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3267 + }, + { + "epoch": 2.610223642172524, + "grad_norm": 0.12365762889385223, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3268 + }, + { + "epoch": 2.6110223642172525, + "grad_norm": 0.19469699263572693, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3269 + }, + { + "epoch": 2.6118210862619806, + "grad_norm": 0.1825639009475708, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3270 + }, + { + "epoch": 2.612619808306709, + "grad_norm": 0.10235249251127243, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3271 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 0.05571124702692032, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3272 + }, + { + "epoch": 2.614217252396166, + "grad_norm": 0.1536952704191208, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3273 + }, + { + "epoch": 2.6150159744408947, + "grad_norm": 0.163212850689888, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3274 + }, + { + "epoch": 2.6158146964856233, + "grad_norm": 0.09640593826770782, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3275 + }, + { + "epoch": 2.6166134185303513, + "grad_norm": 0.04329126700758934, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3276 + }, + { + "epoch": 2.61741214057508, + "grad_norm": 0.03598733991384506, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3277 + }, + { + "epoch": 2.6182108626198084, + "grad_norm": 0.046664439141750336, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3278 + }, + { + "epoch": 2.6190095846645365, + "grad_norm": 0.03692904859781265, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3279 + }, + { + "epoch": 2.619808306709265, + "grad_norm": 0.0482964888215065, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3280 + }, + { + "epoch": 2.6206070287539935, + "grad_norm": 0.07996834069490433, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3281 + }, + { + "epoch": 2.621405750798722, + "grad_norm": 0.060141101479530334, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3282 + }, + { + "epoch": 2.6222044728434506, + "grad_norm": 0.04013051837682724, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 3283 + }, + { + "epoch": 2.623003194888179, + "grad_norm": 0.04011296480894089, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3284 + }, + { + "epoch": 2.623801916932907, + "grad_norm": 0.04112064838409424, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3285 + }, + { + "epoch": 2.6246006389776357, + "grad_norm": 0.057281915098428726, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3286 + }, + { + "epoch": 2.6253993610223643, + "grad_norm": 0.06061771139502525, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3287 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 0.05844549089670181, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3288 + }, + { + "epoch": 2.626996805111821, + "grad_norm": 0.06354600191116333, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3289 + }, + { + "epoch": 2.6277955271565494, + "grad_norm": 0.04568248987197876, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3290 + }, + { + "epoch": 2.628594249201278, + "grad_norm": 0.04340318217873573, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3291 + }, + { + "epoch": 2.6293929712460065, + "grad_norm": 0.07078617066144943, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3292 + }, + { + "epoch": 2.630191693290735, + "grad_norm": 0.09865503013134003, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3293 + }, + { + "epoch": 2.6309904153354635, + "grad_norm": 0.08623871207237244, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3294 + }, + { + "epoch": 2.6317891373801916, + "grad_norm": 0.03787717968225479, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3295 + }, + { + "epoch": 2.63258785942492, + "grad_norm": 0.14653000235557556, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3296 + }, + { + "epoch": 2.6333865814696487, + "grad_norm": 0.2749452292919159, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3297 + }, + { + "epoch": 2.6341853035143767, + "grad_norm": 0.28424543142318726, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3298 + }, + { + "epoch": 2.6349840255591053, + "grad_norm": 0.17354224622249603, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3299 + }, + { + "epoch": 2.635782747603834, + "grad_norm": 0.04208464175462723, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3300 + }, + { + "epoch": 2.6365814696485623, + "grad_norm": 0.15522420406341553, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3301 + }, + { + "epoch": 2.637380191693291, + "grad_norm": 0.17986370623111725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3302 + }, + { + "epoch": 2.6381789137380194, + "grad_norm": 0.07155515998601913, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3303 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.11287503689527512, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3304 + }, + { + "epoch": 2.639776357827476, + "grad_norm": 0.22735139727592468, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3305 + }, + { + "epoch": 2.6405750798722045, + "grad_norm": 0.23528814315795898, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3306 + }, + { + "epoch": 2.641373801916933, + "grad_norm": 0.13828198611736298, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3307 + }, + { + "epoch": 2.642172523961661, + "grad_norm": 0.046783462166786194, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3308 + }, + { + "epoch": 2.6429712460063897, + "grad_norm": 0.13010001182556152, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3309 + }, + { + "epoch": 2.643769968051118, + "grad_norm": 0.12339942902326584, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3310 + }, + { + "epoch": 2.6445686900958467, + "grad_norm": 0.06443019211292267, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3311 + }, + { + "epoch": 2.6453674121405752, + "grad_norm": 0.05086766183376312, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3312 + }, + { + "epoch": 2.6461661341853038, + "grad_norm": 0.1266956627368927, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3313 + }, + { + "epoch": 2.646964856230032, + "grad_norm": 0.1238899901509285, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3314 + }, + { + "epoch": 2.6477635782747604, + "grad_norm": 0.07378736138343811, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3315 + }, + { + "epoch": 2.648562300319489, + "grad_norm": 0.12572194635868073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3316 + }, + { + "epoch": 2.649361022364217, + "grad_norm": 0.18099260330200195, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3317 + }, + { + "epoch": 2.6501597444089455, + "grad_norm": 0.1383541077375412, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3318 + }, + { + "epoch": 2.650958466453674, + "grad_norm": 0.043900374323129654, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3319 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 0.13228318095207214, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3320 + }, + { + "epoch": 2.652555910543131, + "grad_norm": 0.11684399843215942, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3321 + }, + { + "epoch": 2.6533546325878596, + "grad_norm": 0.03879965469241142, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3322 + }, + { + "epoch": 2.6541533546325877, + "grad_norm": 0.1457953006029129, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3323 + }, + { + "epoch": 2.6549520766773163, + "grad_norm": 0.21643802523612976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3324 + }, + { + "epoch": 2.655750798722045, + "grad_norm": 0.20250067114830017, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3325 + }, + { + "epoch": 2.6565495207667733, + "grad_norm": 0.09131773561239243, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3326 + }, + { + "epoch": 2.6573482428115014, + "grad_norm": 0.07217761129140854, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3327 + }, + { + "epoch": 2.65814696485623, + "grad_norm": 0.13251517713069916, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3328 + }, + { + "epoch": 2.6589456869009584, + "grad_norm": 0.09462655335664749, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3329 + }, + { + "epoch": 2.659744408945687, + "grad_norm": 0.04496161639690399, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3330 + }, + { + "epoch": 2.6605431309904155, + "grad_norm": 0.13246162235736847, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3331 + }, + { + "epoch": 2.661341853035144, + "grad_norm": 0.1548391878604889, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3332 + }, + { + "epoch": 2.662140575079872, + "grad_norm": 0.09438800066709518, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3333 + }, + { + "epoch": 2.6629392971246006, + "grad_norm": 0.033411599695682526, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3334 + }, + { + "epoch": 2.663738019169329, + "grad_norm": 0.04015564173460007, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3335 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 0.033046361058950424, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3336 + }, + { + "epoch": 2.665335463258786, + "grad_norm": 0.04766019433736801, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3337 + }, + { + "epoch": 2.6661341853035143, + "grad_norm": 0.06365641951560974, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3338 + }, + { + "epoch": 2.666932907348243, + "grad_norm": 0.03329809010028839, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3339 + }, + { + "epoch": 2.6677316293929714, + "grad_norm": 0.10063061863183975, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3340 + }, + { + "epoch": 2.6685303514377, + "grad_norm": 0.16541579365730286, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 3341 + }, + { + "epoch": 2.669329073482428, + "grad_norm": 0.18877379596233368, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3342 + }, + { + "epoch": 2.6701277955271565, + "grad_norm": 0.12577234208583832, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3343 + }, + { + "epoch": 2.670926517571885, + "grad_norm": 0.04403039440512657, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3344 + }, + { + "epoch": 2.6717252396166136, + "grad_norm": 0.172403946518898, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3345 + }, + { + "epoch": 2.6725239616613417, + "grad_norm": 0.2147791087627411, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3346 + }, + { + "epoch": 2.67332268370607, + "grad_norm": 0.1536005735397339, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3347 + }, + { + "epoch": 2.6741214057507987, + "grad_norm": 0.061038631945848465, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3348 + }, + { + "epoch": 2.6749201277955272, + "grad_norm": 0.03402748703956604, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3349 + }, + { + "epoch": 2.6757188498402558, + "grad_norm": 0.05285736918449402, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3350 + }, + { + "epoch": 2.6765175718849843, + "grad_norm": 0.0807662233710289, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3351 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.057097889482975006, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3352 + }, + { + "epoch": 2.678115015974441, + "grad_norm": 0.06845760345458984, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3353 + }, + { + "epoch": 2.6789137380191694, + "grad_norm": 0.1209796816110611, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3354 + }, + { + "epoch": 2.6797124600638975, + "grad_norm": 0.09372428804636002, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3355 + }, + { + "epoch": 2.680511182108626, + "grad_norm": 0.03795485943555832, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3356 + }, + { + "epoch": 2.6813099041533546, + "grad_norm": 0.14420334994792938, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3357 + }, + { + "epoch": 2.682108626198083, + "grad_norm": 0.23049019277095795, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3358 + }, + { + "epoch": 2.6829073482428116, + "grad_norm": 0.21722057461738586, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3359 + }, + { + "epoch": 2.68370607028754, + "grad_norm": 0.0968366488814354, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3360 + }, + { + "epoch": 2.6845047923322682, + "grad_norm": 0.10279416292905807, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3361 + }, + { + "epoch": 2.6853035143769968, + "grad_norm": 0.2077404409646988, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3362 + }, + { + "epoch": 2.6861022364217253, + "grad_norm": 0.14186711609363556, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3363 + }, + { + "epoch": 2.686900958466454, + "grad_norm": 0.04573604837059975, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3364 + }, + { + "epoch": 2.687699680511182, + "grad_norm": 0.13861627876758575, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3365 + }, + { + "epoch": 2.6884984025559104, + "grad_norm": 0.17746120691299438, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3366 + }, + { + "epoch": 2.689297124600639, + "grad_norm": 0.15865683555603027, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3367 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.05537402629852295, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3368 + }, + { + "epoch": 2.690894568690096, + "grad_norm": 0.064423106610775, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3369 + }, + { + "epoch": 2.6916932907348246, + "grad_norm": 0.0922585278749466, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3370 + }, + { + "epoch": 2.6924920127795526, + "grad_norm": 0.08034171909093857, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3371 + }, + { + "epoch": 2.693290734824281, + "grad_norm": 0.05695292726159096, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3372 + }, + { + "epoch": 2.6940894568690097, + "grad_norm": 0.04140406847000122, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3373 + }, + { + "epoch": 2.6948881789137378, + "grad_norm": 0.038130711764097214, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3374 + }, + { + "epoch": 2.6956869009584663, + "grad_norm": 0.07363594323396683, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3375 + }, + { + "epoch": 2.696485623003195, + "grad_norm": 0.13670513033866882, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3376 + }, + { + "epoch": 2.6972843450479234, + "grad_norm": 0.16614536941051483, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3377 + }, + { + "epoch": 2.698083067092652, + "grad_norm": 0.1346762478351593, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3378 + }, + { + "epoch": 2.6988817891373804, + "grad_norm": 0.06321856379508972, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3379 + }, + { + "epoch": 2.6996805111821085, + "grad_norm": 0.057517897337675095, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3380 + }, + { + "epoch": 2.700479233226837, + "grad_norm": 0.11995001137256622, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3381 + }, + { + "epoch": 2.7012779552715656, + "grad_norm": 0.10514877736568451, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3382 + }, + { + "epoch": 2.702076677316294, + "grad_norm": 0.05942686274647713, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3383 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 0.03508206829428673, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3384 + }, + { + "epoch": 2.7036741214057507, + "grad_norm": 0.05182692036032677, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3385 + }, + { + "epoch": 2.7044728434504792, + "grad_norm": 0.0597345344722271, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3386 + }, + { + "epoch": 2.7052715654952078, + "grad_norm": 0.037486087530851364, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3387 + }, + { + "epoch": 2.7060702875399363, + "grad_norm": 0.040483538061380386, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 3388 + }, + { + "epoch": 2.706869009584665, + "grad_norm": 0.044094670563936234, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3389 + }, + { + "epoch": 2.707667731629393, + "grad_norm": 0.06498228758573532, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3390 + }, + { + "epoch": 2.7084664536741214, + "grad_norm": 0.06955298781394958, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3391 + }, + { + "epoch": 2.70926517571885, + "grad_norm": 0.11691966652870178, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3392 + }, + { + "epoch": 2.710063897763578, + "grad_norm": 0.1183234304189682, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3393 + }, + { + "epoch": 2.7108626198083066, + "grad_norm": 0.08358792215585709, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3394 + }, + { + "epoch": 2.711661341853035, + "grad_norm": 0.04190056398510933, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3395 + }, + { + "epoch": 2.7124600638977636, + "grad_norm": 0.09757649153470993, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3396 + }, + { + "epoch": 2.713258785942492, + "grad_norm": 0.11508934944868088, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 3397 + }, + { + "epoch": 2.7140575079872207, + "grad_norm": 0.05612087994813919, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3398 + }, + { + "epoch": 2.7148562300319488, + "grad_norm": 0.07044408470392227, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3399 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.07732822746038437, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3400 + }, + { + "epoch": 2.716453674121406, + "grad_norm": 0.054326847195625305, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3401 + }, + { + "epoch": 2.7172523961661343, + "grad_norm": 0.041327398270368576, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3402 + }, + { + "epoch": 2.7180511182108624, + "grad_norm": 0.07147548347711563, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3403 + }, + { + "epoch": 2.718849840255591, + "grad_norm": 0.12999942898750305, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3404 + }, + { + "epoch": 2.7196485623003195, + "grad_norm": 0.18404515087604523, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3405 + }, + { + "epoch": 2.720447284345048, + "grad_norm": 0.1873377114534378, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3406 + }, + { + "epoch": 2.7212460063897765, + "grad_norm": 0.0732024610042572, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3407 + }, + { + "epoch": 2.722044728434505, + "grad_norm": 0.07602795958518982, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3408 + }, + { + "epoch": 2.722843450479233, + "grad_norm": 0.07871323823928833, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3409 + }, + { + "epoch": 2.7236421725239617, + "grad_norm": 0.0738302692770958, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3410 + }, + { + "epoch": 2.72444089456869, + "grad_norm": 0.12097286432981491, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3411 + }, + { + "epoch": 2.7252396166134183, + "grad_norm": 0.10136821120977402, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3412 + }, + { + "epoch": 2.726038338658147, + "grad_norm": 0.07281512022018433, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3413 + }, + { + "epoch": 2.7268370607028753, + "grad_norm": 0.09425969421863556, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3414 + }, + { + "epoch": 2.727635782747604, + "grad_norm": 0.11939436942338943, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3415 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 0.07181181758642197, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3416 + }, + { + "epoch": 2.729233226837061, + "grad_norm": 0.06634730845689774, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3417 + }, + { + "epoch": 2.730031948881789, + "grad_norm": 0.0941692590713501, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3418 + }, + { + "epoch": 2.7308306709265175, + "grad_norm": 0.10803452879190445, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3419 + }, + { + "epoch": 2.731629392971246, + "grad_norm": 0.08289305865764618, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3420 + }, + { + "epoch": 2.7324281150159746, + "grad_norm": 0.048421960324048996, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3421 + }, + { + "epoch": 2.7332268370607027, + "grad_norm": 0.09108635783195496, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3422 + }, + { + "epoch": 2.734025559105431, + "grad_norm": 0.13627508282661438, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3423 + }, + { + "epoch": 2.7348242811501597, + "grad_norm": 0.14651858806610107, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3424 + }, + { + "epoch": 2.7356230031948883, + "grad_norm": 0.126741424202919, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3425 + }, + { + "epoch": 2.736421725239617, + "grad_norm": 0.05885545164346695, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3426 + }, + { + "epoch": 2.737220447284345, + "grad_norm": 0.09471739828586578, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3427 + }, + { + "epoch": 2.7380191693290734, + "grad_norm": 0.18026123940944672, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3428 + }, + { + "epoch": 2.738817891373802, + "grad_norm": 0.1737871915102005, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3429 + }, + { + "epoch": 2.7396166134185305, + "grad_norm": 0.052994512021541595, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3430 + }, + { + "epoch": 2.7404153354632586, + "grad_norm": 0.13484452664852142, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 3431 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 0.2207227200269699, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3432 + }, + { + "epoch": 2.7420127795527156, + "grad_norm": 0.17741963267326355, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3433 + }, + { + "epoch": 2.742811501597444, + "grad_norm": 0.07451824843883514, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3434 + }, + { + "epoch": 2.7436102236421727, + "grad_norm": 0.07947403192520142, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3435 + }, + { + "epoch": 2.744408945686901, + "grad_norm": 0.11197762936353683, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3436 + }, + { + "epoch": 2.7452076677316293, + "grad_norm": 0.08398377895355225, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 3437 + }, + { + "epoch": 2.746006389776358, + "grad_norm": 0.03809420019388199, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3438 + }, + { + "epoch": 2.7468051118210863, + "grad_norm": 0.11537694931030273, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3439 + }, + { + "epoch": 2.747603833865815, + "grad_norm": 0.1537221372127533, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3440 + }, + { + "epoch": 2.748402555910543, + "grad_norm": 0.1132403165102005, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3441 + }, + { + "epoch": 2.7492012779552715, + "grad_norm": 0.038440920412540436, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3442 + }, + { + "epoch": 2.75, + "grad_norm": 0.10132595151662827, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3443 + }, + { + "epoch": 2.7507987220447285, + "grad_norm": 0.12446253001689911, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3444 + }, + { + "epoch": 2.751597444089457, + "grad_norm": 0.05364474281668663, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3445 + }, + { + "epoch": 2.752396166134185, + "grad_norm": 0.04705234244465828, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3446 + }, + { + "epoch": 2.7531948881789137, + "grad_norm": 0.10524975508451462, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3447 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 0.12036000937223434, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3448 + }, + { + "epoch": 2.7547923322683707, + "grad_norm": 0.08042819797992706, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3449 + }, + { + "epoch": 2.755591054313099, + "grad_norm": 0.04404102638363838, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3450 + }, + { + "epoch": 2.7563897763578273, + "grad_norm": 0.0766257792711258, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3451 + }, + { + "epoch": 2.757188498402556, + "grad_norm": 0.06359248608350754, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3452 + }, + { + "epoch": 2.7579872204472844, + "grad_norm": 0.06752901524305344, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3453 + }, + { + "epoch": 2.758785942492013, + "grad_norm": 0.12018375843763351, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3454 + }, + { + "epoch": 2.7595846645367414, + "grad_norm": 0.15904727578163147, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3455 + }, + { + "epoch": 2.7603833865814695, + "grad_norm": 0.12665021419525146, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3456 + }, + { + "epoch": 2.761182108626198, + "grad_norm": 0.07552342861890793, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3457 + }, + { + "epoch": 2.7619808306709266, + "grad_norm": 0.25927653908729553, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3458 + }, + { + "epoch": 2.762779552715655, + "grad_norm": 0.3487590253353119, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3459 + }, + { + "epoch": 2.763578274760383, + "grad_norm": 0.2783665359020233, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3460 + }, + { + "epoch": 2.7643769968051117, + "grad_norm": 0.054424334317445755, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3461 + }, + { + "epoch": 2.7651757188498403, + "grad_norm": 0.240921288728714, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3462 + }, + { + "epoch": 2.765974440894569, + "grad_norm": 0.3380962014198303, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3463 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.1514623463153839, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3464 + }, + { + "epoch": 2.7675718849840254, + "grad_norm": 0.15135464072227478, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3465 + }, + { + "epoch": 2.768370607028754, + "grad_norm": 0.262546181678772, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3466 + }, + { + "epoch": 2.7691693290734825, + "grad_norm": 0.11052273958921432, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3467 + }, + { + "epoch": 2.769968051118211, + "grad_norm": 0.14473804831504822, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3468 + }, + { + "epoch": 2.770766773162939, + "grad_norm": 0.24968142807483673, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3469 + }, + { + "epoch": 2.7715654952076676, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3470 + }, + { + "epoch": 2.772364217252396, + "grad_norm": 0.0957072302699089, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3471 + }, + { + "epoch": 2.7731629392971247, + "grad_norm": 0.2122000902891159, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3472 + }, + { + "epoch": 2.773961661341853, + "grad_norm": 0.15716226398944855, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3473 + }, + { + "epoch": 2.7747603833865817, + "grad_norm": 0.05107169970870018, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3474 + }, + { + "epoch": 2.77555910543131, + "grad_norm": 0.19824674725532532, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3475 + }, + { + "epoch": 2.7763578274760383, + "grad_norm": 0.16866235435009003, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3476 + }, + { + "epoch": 2.777156549520767, + "grad_norm": 0.03332412987947464, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3477 + }, + { + "epoch": 2.777955271565495, + "grad_norm": 0.1771237850189209, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3478 + }, + { + "epoch": 2.7787539936102235, + "grad_norm": 0.23501509428024292, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3479 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.0976579561829567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3480 + }, + { + "epoch": 2.7803514376996805, + "grad_norm": 0.11640458554029465, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3481 + }, + { + "epoch": 2.781150159744409, + "grad_norm": 0.2140960842370987, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3482 + }, + { + "epoch": 2.7819488817891376, + "grad_norm": 0.2055736929178238, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3483 + }, + { + "epoch": 2.7827476038338657, + "grad_norm": 0.09386937320232391, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 3484 + }, + { + "epoch": 2.783546325878594, + "grad_norm": 0.11534380912780762, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3485 + }, + { + "epoch": 2.7843450479233227, + "grad_norm": 0.19186711311340332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3486 + }, + { + "epoch": 2.7851437699680512, + "grad_norm": 0.26858124136924744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3487 + }, + { + "epoch": 2.7859424920127793, + "grad_norm": 0.05965370684862137, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 3488 + }, + { + "epoch": 2.786741214057508, + "grad_norm": 0.17804528772830963, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3489 + }, + { + "epoch": 2.7875399361022364, + "grad_norm": 0.1802065223455429, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 3490 + }, + { + "epoch": 2.788338658146965, + "grad_norm": 0.06634502857923508, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3491 + }, + { + "epoch": 2.7891373801916934, + "grad_norm": 0.06682102382183075, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3492 + }, + { + "epoch": 2.789936102236422, + "grad_norm": 0.08941584080457687, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3493 + }, + { + "epoch": 2.79073482428115, + "grad_norm": 0.06336037069559097, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3494 + }, + { + "epoch": 2.7915335463258786, + "grad_norm": 0.05562690272927284, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3495 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.10294149816036224, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3496 + }, + { + "epoch": 2.793130990415335, + "grad_norm": 0.11363442987203598, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3497 + }, + { + "epoch": 2.7939297124600637, + "grad_norm": 0.05790446698665619, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3498 + }, + { + "epoch": 2.7947284345047922, + "grad_norm": 0.09351370483636856, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3499 + }, + { + "epoch": 2.7955271565495208, + "grad_norm": 0.2225412130355835, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3500 + }, + { + "epoch": 2.7963258785942493, + "grad_norm": 0.21828165650367737, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3501 + }, + { + "epoch": 2.797124600638978, + "grad_norm": 0.06987733393907547, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 3502 + }, + { + "epoch": 2.797923322683706, + "grad_norm": 0.14518103003501892, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3503 + }, + { + "epoch": 2.7987220447284344, + "grad_norm": 0.24233761429786682, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3504 + }, + { + "epoch": 2.799520766773163, + "grad_norm": 0.19286365807056427, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3505 + }, + { + "epoch": 2.8003194888178915, + "grad_norm": 0.07981286942958832, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3506 + }, + { + "epoch": 2.8011182108626196, + "grad_norm": 0.050319187343120575, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3507 + }, + { + "epoch": 2.801916932907348, + "grad_norm": 0.09955406934022903, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3508 + }, + { + "epoch": 2.8027156549520766, + "grad_norm": 0.048427898436784744, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3509 + }, + { + "epoch": 2.803514376996805, + "grad_norm": 0.0805777907371521, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3510 + }, + { + "epoch": 2.8043130990415337, + "grad_norm": 0.07289621978998184, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3511 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.04940955713391304, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3512 + }, + { + "epoch": 2.8059105431309903, + "grad_norm": 0.07228294759988785, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3513 + }, + { + "epoch": 2.806709265175719, + "grad_norm": 0.06902103871107101, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3514 + }, + { + "epoch": 2.8075079872204474, + "grad_norm": 0.056301236152648926, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3515 + }, + { + "epoch": 2.8083067092651754, + "grad_norm": 0.03880859166383743, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3516 + }, + { + "epoch": 2.809105431309904, + "grad_norm": 0.04914811998605728, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3517 + }, + { + "epoch": 2.8099041533546325, + "grad_norm": 0.04139270633459091, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3518 + }, + { + "epoch": 2.810702875399361, + "grad_norm": 0.05118592828512192, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3519 + }, + { + "epoch": 2.8115015974440896, + "grad_norm": 0.03548616170883179, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 3520 + }, + { + "epoch": 2.812300319488818, + "grad_norm": 0.04883241280913353, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3521 + }, + { + "epoch": 2.813099041533546, + "grad_norm": 0.044492170214653015, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3522 + }, + { + "epoch": 2.8138977635782747, + "grad_norm": 0.050978366285562515, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3523 + }, + { + "epoch": 2.8146964856230032, + "grad_norm": 0.04663826525211334, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3524 + }, + { + "epoch": 2.8154952076677318, + "grad_norm": 0.06378154456615448, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3525 + }, + { + "epoch": 2.81629392971246, + "grad_norm": 0.06913618743419647, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3526 + }, + { + "epoch": 2.8170926517571884, + "grad_norm": 0.084662064909935, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3527 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 0.08352439105510712, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3528 + }, + { + "epoch": 2.8186900958466454, + "grad_norm": 0.07254189252853394, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3529 + }, + { + "epoch": 2.819488817891374, + "grad_norm": 0.04416285827755928, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 3530 + }, + { + "epoch": 2.8202875399361025, + "grad_norm": 0.056230951100587845, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3531 + }, + { + "epoch": 2.8210862619808306, + "grad_norm": 0.11055732518434525, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3532 + }, + { + "epoch": 2.821884984025559, + "grad_norm": 0.08660246431827545, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3533 + }, + { + "epoch": 2.8226837060702876, + "grad_norm": 0.0691947191953659, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3534 + }, + { + "epoch": 2.8234824281150157, + "grad_norm": 0.09254545718431473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3535 + }, + { + "epoch": 2.8242811501597442, + "grad_norm": 0.0663340613245964, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3536 + }, + { + "epoch": 2.8250798722044728, + "grad_norm": 0.05052514374256134, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3537 + }, + { + "epoch": 2.8258785942492013, + "grad_norm": 0.08364969491958618, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3538 + }, + { + "epoch": 2.82667731629393, + "grad_norm": 0.08269570767879486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3539 + }, + { + "epoch": 2.8274760383386583, + "grad_norm": 0.06289245933294296, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3540 + }, + { + "epoch": 2.8282747603833864, + "grad_norm": 0.03565627336502075, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3541 + }, + { + "epoch": 2.829073482428115, + "grad_norm": 0.057896651327610016, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3542 + }, + { + "epoch": 2.8298722044728435, + "grad_norm": 0.046379514038562775, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3543 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.06231336295604706, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3544 + }, + { + "epoch": 2.8314696485623, + "grad_norm": 0.03983502462506294, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3545 + }, + { + "epoch": 2.8322683706070286, + "grad_norm": 0.07364759594202042, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3546 + }, + { + "epoch": 2.833067092651757, + "grad_norm": 0.11596816778182983, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3547 + }, + { + "epoch": 2.8338658146964857, + "grad_norm": 0.10731378942728043, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3548 + }, + { + "epoch": 2.834664536741214, + "grad_norm": 0.06365050375461578, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3549 + }, + { + "epoch": 2.8354632587859427, + "grad_norm": 0.055451441556215286, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3550 + }, + { + "epoch": 2.836261980830671, + "grad_norm": 0.1490558534860611, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3551 + }, + { + "epoch": 2.8370607028753994, + "grad_norm": 0.1539796143770218, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3552 + }, + { + "epoch": 2.837859424920128, + "grad_norm": 0.06760501861572266, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3553 + }, + { + "epoch": 2.838658146964856, + "grad_norm": 0.0685611367225647, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 3554 + }, + { + "epoch": 2.8394568690095845, + "grad_norm": 0.14234358072280884, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3555 + }, + { + "epoch": 2.840255591054313, + "grad_norm": 0.14428865909576416, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3556 + }, + { + "epoch": 2.8410543130990416, + "grad_norm": 0.07594695687294006, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3557 + }, + { + "epoch": 2.84185303514377, + "grad_norm": 0.040841538459062576, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3558 + }, + { + "epoch": 2.8426517571884986, + "grad_norm": 0.04991824924945831, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3559 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 0.03846943378448486, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3560 + }, + { + "epoch": 2.844249201277955, + "grad_norm": 0.04851507395505905, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3561 + }, + { + "epoch": 2.8450479233226837, + "grad_norm": 0.0635538399219513, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3562 + }, + { + "epoch": 2.8458466453674123, + "grad_norm": 0.11812663078308105, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3563 + }, + { + "epoch": 2.8466453674121404, + "grad_norm": 0.05664098262786865, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3564 + }, + { + "epoch": 2.847444089456869, + "grad_norm": 0.03532585874199867, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3565 + }, + { + "epoch": 2.8482428115015974, + "grad_norm": 0.06758403033018112, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 3566 + }, + { + "epoch": 2.849041533546326, + "grad_norm": 0.06279300898313522, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3567 + }, + { + "epoch": 2.8498402555910545, + "grad_norm": 0.043967198580503464, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3568 + }, + { + "epoch": 2.850638977635783, + "grad_norm": 0.04900701716542244, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 3569 + }, + { + "epoch": 2.851437699680511, + "grad_norm": 0.07339311391115189, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3570 + }, + { + "epoch": 2.8522364217252396, + "grad_norm": 0.10644743591547012, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3571 + }, + { + "epoch": 2.853035143769968, + "grad_norm": 0.10544353723526001, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3572 + }, + { + "epoch": 2.8538338658146962, + "grad_norm": 0.0590951181948185, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3573 + }, + { + "epoch": 2.8546325878594248, + "grad_norm": 0.05038939788937569, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3574 + }, + { + "epoch": 2.8554313099041533, + "grad_norm": 0.06013040617108345, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3575 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.07330521196126938, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3576 + }, + { + "epoch": 2.8570287539936103, + "grad_norm": 0.12049853056669235, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3577 + }, + { + "epoch": 2.857827476038339, + "grad_norm": 0.13056780397891998, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3578 + }, + { + "epoch": 2.858626198083067, + "grad_norm": 0.12987029552459717, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3579 + }, + { + "epoch": 2.8594249201277955, + "grad_norm": 0.08681001514196396, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3580 + }, + { + "epoch": 2.860223642172524, + "grad_norm": 0.060947105288505554, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3581 + }, + { + "epoch": 2.8610223642172525, + "grad_norm": 0.10896368324756622, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3582 + }, + { + "epoch": 2.8618210862619806, + "grad_norm": 0.1251460760831833, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3583 + }, + { + "epoch": 2.862619808306709, + "grad_norm": 0.035174671560525894, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3584 + }, + { + "epoch": 2.8634185303514377, + "grad_norm": 0.12026303261518478, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3585 + }, + { + "epoch": 2.864217252396166, + "grad_norm": 0.16679063439369202, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3586 + }, + { + "epoch": 2.8650159744408947, + "grad_norm": 0.19229409098625183, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3587 + }, + { + "epoch": 2.8658146964856233, + "grad_norm": 0.17964699864387512, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3588 + }, + { + "epoch": 2.8666134185303513, + "grad_norm": 0.10671430081129074, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3589 + }, + { + "epoch": 2.86741214057508, + "grad_norm": 0.04453161358833313, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 3590 + }, + { + "epoch": 2.8682108626198084, + "grad_norm": 0.1531655639410019, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3591 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.19321779906749725, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3592 + }, + { + "epoch": 2.869808306709265, + "grad_norm": 0.19540782272815704, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3593 + }, + { + "epoch": 2.8706070287539935, + "grad_norm": 0.22210878133773804, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3594 + }, + { + "epoch": 2.871405750798722, + "grad_norm": 0.2089247703552246, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3595 + }, + { + "epoch": 2.8722044728434506, + "grad_norm": 0.11910446733236313, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3596 + }, + { + "epoch": 2.873003194888179, + "grad_norm": 0.05230247974395752, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3597 + }, + { + "epoch": 2.873801916932907, + "grad_norm": 0.09492263197898865, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3598 + }, + { + "epoch": 2.8746006389776357, + "grad_norm": 0.1396690160036087, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3599 + }, + { + "epoch": 2.8753993610223643, + "grad_norm": 0.12218718230724335, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3600 + }, + { + "epoch": 2.876198083067093, + "grad_norm": 0.05510007217526436, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 3601 + }, + { + "epoch": 2.876996805111821, + "grad_norm": 0.04949348792433739, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3602 + }, + { + "epoch": 2.8777955271565494, + "grad_norm": 0.06522537767887115, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3603 + }, + { + "epoch": 2.878594249201278, + "grad_norm": 0.034176018089056015, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3604 + }, + { + "epoch": 2.8793929712460065, + "grad_norm": 0.07579770684242249, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3605 + }, + { + "epoch": 2.880191693290735, + "grad_norm": 0.09512948244810104, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3606 + }, + { + "epoch": 2.8809904153354635, + "grad_norm": 0.059753213077783585, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3607 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.2461470365524292, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3608 + }, + { + "epoch": 2.88258785942492, + "grad_norm": 0.11298660188913345, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3609 + }, + { + "epoch": 2.8833865814696487, + "grad_norm": 0.20638997852802277, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3610 + }, + { + "epoch": 2.8841853035143767, + "grad_norm": 0.2394232600927353, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3611 + }, + { + "epoch": 2.8849840255591053, + "grad_norm": 0.15168963372707367, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3612 + }, + { + "epoch": 2.885782747603834, + "grad_norm": 0.03990825638175011, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3613 + }, + { + "epoch": 2.8865814696485623, + "grad_norm": 0.1725347936153412, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3614 + }, + { + "epoch": 2.887380191693291, + "grad_norm": 0.20821869373321533, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3615 + }, + { + "epoch": 2.8881789137380194, + "grad_norm": 0.14441269636154175, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3616 + }, + { + "epoch": 2.8889776357827475, + "grad_norm": 0.037162624299526215, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3617 + }, + { + "epoch": 2.889776357827476, + "grad_norm": 0.11550657451152802, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3618 + }, + { + "epoch": 2.8905750798722045, + "grad_norm": 0.15214277803897858, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3619 + }, + { + "epoch": 2.891373801916933, + "grad_norm": 0.09059946238994598, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3620 + }, + { + "epoch": 2.892172523961661, + "grad_norm": 0.03436599299311638, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3621 + }, + { + "epoch": 2.8929712460063897, + "grad_norm": 0.0839625746011734, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3622 + }, + { + "epoch": 2.893769968051118, + "grad_norm": 0.1618664264678955, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3623 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.08216597139835358, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3624 + }, + { + "epoch": 2.8953674121405752, + "grad_norm": 0.06303965300321579, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3625 + }, + { + "epoch": 2.8961661341853038, + "grad_norm": 0.050278183072805405, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3626 + }, + { + "epoch": 2.896964856230032, + "grad_norm": 0.04620242863893509, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3627 + }, + { + "epoch": 2.8977635782747604, + "grad_norm": 0.04937691614031792, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3628 + }, + { + "epoch": 2.898562300319489, + "grad_norm": 0.056928347796201706, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3629 + }, + { + "epoch": 2.899361022364217, + "grad_norm": 0.04932256042957306, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3630 + }, + { + "epoch": 2.9001597444089455, + "grad_norm": 0.04320303350687027, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3631 + }, + { + "epoch": 2.900958466453674, + "grad_norm": 0.08589868247509003, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3632 + }, + { + "epoch": 2.9017571884984026, + "grad_norm": 0.11458484083414078, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3633 + }, + { + "epoch": 2.902555910543131, + "grad_norm": 0.13549752533435822, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3634 + }, + { + "epoch": 2.9033546325878596, + "grad_norm": 0.1327086091041565, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3635 + }, + { + "epoch": 2.9041533546325877, + "grad_norm": 0.08295682817697525, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3636 + }, + { + "epoch": 2.9049520766773163, + "grad_norm": 0.05216526240110397, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3637 + }, + { + "epoch": 2.905750798722045, + "grad_norm": 0.11048691719770432, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3638 + }, + { + "epoch": 2.9065495207667733, + "grad_norm": 0.17681372165679932, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3639 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.16901300847530365, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3640 + }, + { + "epoch": 2.90814696485623, + "grad_norm": 0.10261020064353943, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3641 + }, + { + "epoch": 2.9089456869009584, + "grad_norm": 0.042478349059820175, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3642 + }, + { + "epoch": 2.909744408945687, + "grad_norm": 0.11727496981620789, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3643 + }, + { + "epoch": 2.9105431309904155, + "grad_norm": 0.14884977042675018, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3644 + }, + { + "epoch": 2.911341853035144, + "grad_norm": 0.047877270728349686, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3645 + }, + { + "epoch": 2.912140575079872, + "grad_norm": 0.11930714547634125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3646 + }, + { + "epoch": 2.9129392971246006, + "grad_norm": 0.1873956024646759, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3647 + }, + { + "epoch": 2.913738019169329, + "grad_norm": 0.22310249507427216, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3648 + }, + { + "epoch": 2.9145367412140573, + "grad_norm": 0.21259911358356476, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3649 + }, + { + "epoch": 2.915335463258786, + "grad_norm": 0.11584217846393585, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3650 + }, + { + "epoch": 2.9161341853035143, + "grad_norm": 0.04092720150947571, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3651 + }, + { + "epoch": 2.916932907348243, + "grad_norm": 0.14542047679424286, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3652 + }, + { + "epoch": 2.9177316293929714, + "grad_norm": 0.16328515112400055, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3653 + }, + { + "epoch": 2.9185303514377, + "grad_norm": 0.11284583806991577, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3654 + }, + { + "epoch": 2.919329073482428, + "grad_norm": 0.03723357245326042, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3655 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.1347448229789734, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3656 + }, + { + "epoch": 2.920926517571885, + "grad_norm": 0.1697797328233719, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3657 + }, + { + "epoch": 2.9217252396166136, + "grad_norm": 0.12122484296560287, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3658 + }, + { + "epoch": 2.9225239616613417, + "grad_norm": 0.043503791093826294, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3659 + }, + { + "epoch": 2.92332268370607, + "grad_norm": 0.1600242555141449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3660 + }, + { + "epoch": 2.9241214057507987, + "grad_norm": 0.21065576374530792, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3661 + }, + { + "epoch": 2.9249201277955272, + "grad_norm": 0.16726253926753998, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3662 + }, + { + "epoch": 2.9257188498402558, + "grad_norm": 0.09178615361452103, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3663 + }, + { + "epoch": 2.9265175718849843, + "grad_norm": 0.0447201170027256, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3664 + }, + { + "epoch": 2.9273162939297124, + "grad_norm": 0.10462333261966705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3665 + }, + { + "epoch": 2.928115015974441, + "grad_norm": 0.08236772567033768, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3666 + }, + { + "epoch": 2.9289137380191694, + "grad_norm": 0.06551375985145569, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3667 + }, + { + "epoch": 2.9297124600638975, + "grad_norm": 0.1531982123851776, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3668 + }, + { + "epoch": 2.930511182108626, + "grad_norm": 0.19483166933059692, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3669 + }, + { + "epoch": 2.9313099041533546, + "grad_norm": 0.12347809225320816, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3670 + }, + { + "epoch": 2.932108626198083, + "grad_norm": 0.05494467169046402, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3671 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 0.2280847579240799, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3672 + }, + { + "epoch": 2.93370607028754, + "grad_norm": 0.30344241857528687, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3673 + }, + { + "epoch": 2.9345047923322682, + "grad_norm": 0.243449404835701, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3674 + }, + { + "epoch": 2.9353035143769968, + "grad_norm": 0.11542543768882751, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3675 + }, + { + "epoch": 2.9361022364217253, + "grad_norm": 0.09501481056213379, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3676 + }, + { + "epoch": 2.936900958466454, + "grad_norm": 0.2299363762140274, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3677 + }, + { + "epoch": 2.937699680511182, + "grad_norm": 0.15020152926445007, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3678 + }, + { + "epoch": 2.9384984025559104, + "grad_norm": 0.0655093789100647, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3679 + }, + { + "epoch": 2.939297124600639, + "grad_norm": 0.15242713689804077, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3680 + }, + { + "epoch": 2.9400958466453675, + "grad_norm": 0.13315139710903168, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3681 + }, + { + "epoch": 2.940894568690096, + "grad_norm": 0.05966462939977646, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3682 + }, + { + "epoch": 2.9416932907348246, + "grad_norm": 0.08146806806325912, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3683 + }, + { + "epoch": 2.9424920127795526, + "grad_norm": 0.13615436851978302, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3684 + }, + { + "epoch": 2.943290734824281, + "grad_norm": 0.10889092832803726, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3685 + }, + { + "epoch": 2.9440894568690097, + "grad_norm": 0.03455124795436859, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3686 + }, + { + "epoch": 2.9448881789137378, + "grad_norm": 0.07490532845258713, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3687 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.08072194457054138, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3688 + }, + { + "epoch": 2.946485623003195, + "grad_norm": 0.03630111739039421, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3689 + }, + { + "epoch": 2.9472843450479234, + "grad_norm": 0.09075939655303955, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3690 + }, + { + "epoch": 2.948083067092652, + "grad_norm": 0.1618475615978241, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3691 + }, + { + "epoch": 2.9488817891373804, + "grad_norm": 0.18354517221450806, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3692 + }, + { + "epoch": 2.9496805111821085, + "grad_norm": 0.170358344912529, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3693 + }, + { + "epoch": 2.950479233226837, + "grad_norm": 0.10800250619649887, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3694 + }, + { + "epoch": 2.9512779552715656, + "grad_norm": 0.03771398589015007, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3695 + }, + { + "epoch": 2.952076677316294, + "grad_norm": 0.07931157946586609, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3696 + }, + { + "epoch": 2.952875399361022, + "grad_norm": 0.08149557560682297, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3697 + }, + { + "epoch": 2.9536741214057507, + "grad_norm": 0.05122899264097214, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3698 + }, + { + "epoch": 2.9544728434504792, + "grad_norm": 0.040845707058906555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3699 + }, + { + "epoch": 2.9552715654952078, + "grad_norm": 0.11444225907325745, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3700 + }, + { + "epoch": 2.9560702875399363, + "grad_norm": 0.20140959322452545, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3701 + }, + { + "epoch": 2.956869009584665, + "grad_norm": 0.24982111155986786, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3702 + }, + { + "epoch": 2.957667731629393, + "grad_norm": 0.21290510892868042, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3703 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.11526014655828476, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3704 + }, + { + "epoch": 2.95926517571885, + "grad_norm": 0.03769242390990257, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3705 + }, + { + "epoch": 2.960063897763578, + "grad_norm": 0.091837577521801, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3706 + }, + { + "epoch": 2.9608626198083066, + "grad_norm": 0.0956759825348854, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3707 + }, + { + "epoch": 2.961661341853035, + "grad_norm": 0.06945781409740448, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3708 + }, + { + "epoch": 2.9624600638977636, + "grad_norm": 0.03904029354453087, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3709 + }, + { + "epoch": 2.963258785942492, + "grad_norm": 0.1264238953590393, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3710 + }, + { + "epoch": 2.9640575079872207, + "grad_norm": 0.1689605861902237, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3711 + }, + { + "epoch": 2.9648562300319488, + "grad_norm": 0.15059368312358856, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3712 + }, + { + "epoch": 2.9656549520766773, + "grad_norm": 0.12976346909999847, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3713 + }, + { + "epoch": 2.966453674121406, + "grad_norm": 0.08460741490125656, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3714 + }, + { + "epoch": 2.9672523961661343, + "grad_norm": 0.04914790764451027, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3715 + }, + { + "epoch": 2.9680511182108624, + "grad_norm": 0.09629235416650772, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3716 + }, + { + "epoch": 2.968849840255591, + "grad_norm": 0.0895731970667839, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 3717 + }, + { + "epoch": 2.9696485623003195, + "grad_norm": 0.039528124034404755, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3718 + }, + { + "epoch": 2.970447284345048, + "grad_norm": 0.12843455374240875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3719 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.1754530519247055, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3720 + }, + { + "epoch": 2.972044728434505, + "grad_norm": 0.14169782400131226, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3721 + }, + { + "epoch": 2.972843450479233, + "grad_norm": 0.04416975378990173, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3722 + }, + { + "epoch": 2.9736421725239617, + "grad_norm": 0.1259031444787979, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3723 + }, + { + "epoch": 2.97444089456869, + "grad_norm": 0.17667949199676514, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3724 + }, + { + "epoch": 2.9752396166134183, + "grad_norm": 0.1213974729180336, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3725 + }, + { + "epoch": 2.976038338658147, + "grad_norm": 0.052554335445165634, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3726 + }, + { + "epoch": 2.9768370607028753, + "grad_norm": 0.13257208466529846, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3727 + }, + { + "epoch": 2.977635782747604, + "grad_norm": 0.1463504135608673, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3728 + }, + { + "epoch": 2.9784345047923324, + "grad_norm": 0.08546306937932968, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3729 + }, + { + "epoch": 2.979233226837061, + "grad_norm": 0.04226094111800194, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3730 + }, + { + "epoch": 2.980031948881789, + "grad_norm": 0.0924859419465065, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3731 + }, + { + "epoch": 2.9808306709265175, + "grad_norm": 0.1094423234462738, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3732 + }, + { + "epoch": 2.981629392971246, + "grad_norm": 0.11132006347179413, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3733 + }, + { + "epoch": 2.9824281150159746, + "grad_norm": 0.11010250449180603, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3734 + }, + { + "epoch": 2.9832268370607027, + "grad_norm": 0.10370460152626038, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3735 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 0.08460240811109543, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3736 + }, + { + "epoch": 2.9848242811501597, + "grad_norm": 0.06218400225043297, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3737 + }, + { + "epoch": 2.9856230031948883, + "grad_norm": 0.07446395605802536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3738 + }, + { + "epoch": 2.986421725239617, + "grad_norm": 0.06072726845741272, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3739 + }, + { + "epoch": 2.987220447284345, + "grad_norm": 0.07607559114694595, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3740 + }, + { + "epoch": 2.9880191693290734, + "grad_norm": 0.151380717754364, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3741 + }, + { + "epoch": 2.988817891373802, + "grad_norm": 0.24132277071475983, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3742 + }, + { + "epoch": 2.9896166134185305, + "grad_norm": 0.2346547245979309, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3743 + }, + { + "epoch": 2.9904153354632586, + "grad_norm": 0.090092234313488, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3744 + }, + { + "epoch": 2.991214057507987, + "grad_norm": 0.10230003297328949, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3745 + }, + { + "epoch": 2.9920127795527156, + "grad_norm": 0.17678654193878174, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3746 + }, + { + "epoch": 2.992811501597444, + "grad_norm": 0.16382110118865967, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3747 + }, + { + "epoch": 2.9936102236421727, + "grad_norm": 0.06456442922353745, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3748 + }, + { + "epoch": 2.994408945686901, + "grad_norm": 0.1774967759847641, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3749 + }, + { + "epoch": 2.9952076677316293, + "grad_norm": 0.19274447858333588, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3750 + }, + { + "epoch": 2.996006389776358, + "grad_norm": 0.10767998546361923, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3751 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.07864238321781158, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3752 + }, + { + "epoch": 2.997603833865815, + "grad_norm": 0.21339190006256104, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3753 + }, + { + "epoch": 2.998402555910543, + "grad_norm": 0.2560347616672516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3754 + }, + { + "epoch": 2.9992012779552715, + "grad_norm": 0.15730907022953033, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3755 + }, + { + "epoch": 3.0, + "grad_norm": 0.09766457974910736, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3756 + }, + { + "epoch": 3.0007987220447285, + "grad_norm": 0.24393433332443237, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3757 + }, + { + "epoch": 3.001597444089457, + "grad_norm": 0.17650263011455536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3758 + }, + { + "epoch": 3.002396166134185, + "grad_norm": 0.06490518152713776, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3759 + }, + { + "epoch": 3.0031948881789137, + "grad_norm": 0.10893388092517853, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3760 + }, + { + "epoch": 3.003993610223642, + "grad_norm": 0.13606922328472137, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3761 + }, + { + "epoch": 3.0047923322683707, + "grad_norm": 0.07880546152591705, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3762 + }, + { + "epoch": 3.0055910543130993, + "grad_norm": 0.04203686863183975, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3763 + }, + { + "epoch": 3.0063897763578273, + "grad_norm": 0.07509997487068176, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3764 + }, + { + "epoch": 3.007188498402556, + "grad_norm": 0.08529910445213318, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3765 + }, + { + "epoch": 3.0079872204472844, + "grad_norm": 0.05542825534939766, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3766 + }, + { + "epoch": 3.008785942492013, + "grad_norm": 0.08245155215263367, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3767 + }, + { + "epoch": 3.009584664536741, + "grad_norm": 0.09580255299806595, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3768 + }, + { + "epoch": 3.0103833865814695, + "grad_norm": 0.08233854174613953, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3769 + }, + { + "epoch": 3.011182108626198, + "grad_norm": 0.0589553639292717, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3770 + }, + { + "epoch": 3.0119808306709266, + "grad_norm": 0.09862494468688965, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3771 + }, + { + "epoch": 3.012779552715655, + "grad_norm": 0.1471278816461563, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3772 + }, + { + "epoch": 3.013578274760383, + "grad_norm": 0.1422986537218094, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3773 + }, + { + "epoch": 3.0143769968051117, + "grad_norm": 0.06627846509218216, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3774 + }, + { + "epoch": 3.0151757188498403, + "grad_norm": 0.04936077445745468, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3775 + }, + { + "epoch": 3.015974440894569, + "grad_norm": 0.0745953619480133, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3776 + }, + { + "epoch": 3.0167731629392973, + "grad_norm": 0.0725102499127388, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3777 + }, + { + "epoch": 3.0175718849840254, + "grad_norm": 0.04181717708706856, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3778 + }, + { + "epoch": 3.018370607028754, + "grad_norm": 0.09955357760190964, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3779 + }, + { + "epoch": 3.0191693290734825, + "grad_norm": 0.21014735102653503, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3780 + }, + { + "epoch": 3.019968051118211, + "grad_norm": 0.30597689747810364, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3781 + }, + { + "epoch": 3.0207667731629395, + "grad_norm": 0.2930602431297302, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3782 + }, + { + "epoch": 3.0215654952076676, + "grad_norm": 0.1190100908279419, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3783 + }, + { + "epoch": 3.022364217252396, + "grad_norm": 0.0655524879693985, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3784 + }, + { + "epoch": 3.0231629392971247, + "grad_norm": 0.12062554061412811, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3785 + }, + { + "epoch": 3.023961661341853, + "grad_norm": 0.09680327773094177, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3786 + }, + { + "epoch": 3.0247603833865813, + "grad_norm": 0.0555860660970211, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3787 + }, + { + "epoch": 3.02555910543131, + "grad_norm": 0.1271962672472, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3788 + }, + { + "epoch": 3.0263578274760383, + "grad_norm": 0.12178758531808853, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3789 + }, + { + "epoch": 3.027156549520767, + "grad_norm": 0.09623143821954727, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3790 + }, + { + "epoch": 3.0279552715654954, + "grad_norm": 0.04004101827740669, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3791 + }, + { + "epoch": 3.0287539936102235, + "grad_norm": 0.14001014828681946, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3792 + }, + { + "epoch": 3.029552715654952, + "grad_norm": 0.24241770803928375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3793 + }, + { + "epoch": 3.0303514376996805, + "grad_norm": 0.29141902923583984, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3794 + }, + { + "epoch": 3.031150159744409, + "grad_norm": 0.22814971208572388, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 3795 + }, + { + "epoch": 3.0319488817891376, + "grad_norm": 0.08114828914403915, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3796 + }, + { + "epoch": 3.0327476038338657, + "grad_norm": 0.08104736357927322, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3797 + }, + { + "epoch": 3.033546325878594, + "grad_norm": 0.12007702887058258, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3798 + }, + { + "epoch": 3.0343450479233227, + "grad_norm": 0.06497872620820999, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3799 + }, + { + "epoch": 3.0351437699680512, + "grad_norm": 0.07407233864068985, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3800 + }, + { + "epoch": 3.0359424920127798, + "grad_norm": 0.16386932134628296, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3801 + }, + { + "epoch": 3.036741214057508, + "grad_norm": 0.21633599698543549, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3802 + }, + { + "epoch": 3.0375399361022364, + "grad_norm": 0.19224147498607635, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3803 + }, + { + "epoch": 3.038338658146965, + "grad_norm": 0.04962728172540665, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3804 + }, + { + "epoch": 3.0391373801916934, + "grad_norm": 0.17984353005886078, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3805 + }, + { + "epoch": 3.0399361022364215, + "grad_norm": 0.31483346223831177, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3806 + }, + { + "epoch": 3.04073482428115, + "grad_norm": 0.27175095677375793, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3807 + }, + { + "epoch": 3.0415335463258786, + "grad_norm": 0.06302175670862198, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3808 + }, + { + "epoch": 3.042332268370607, + "grad_norm": 0.18620255589485168, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3809 + }, + { + "epoch": 3.0431309904153356, + "grad_norm": 0.23254868388175964, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3810 + }, + { + "epoch": 3.0439297124600637, + "grad_norm": 0.08763844519853592, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3811 + }, + { + "epoch": 3.0447284345047922, + "grad_norm": 0.13173392415046692, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3812 + }, + { + "epoch": 3.0455271565495208, + "grad_norm": 0.24171577394008636, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3813 + }, + { + "epoch": 3.0463258785942493, + "grad_norm": 0.17649634182453156, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3814 + }, + { + "epoch": 3.047124600638978, + "grad_norm": 0.03800780326128006, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3815 + }, + { + "epoch": 3.047923322683706, + "grad_norm": 0.20039476454257965, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3816 + }, + { + "epoch": 3.0487220447284344, + "grad_norm": 0.26794761419296265, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 3817 + }, + { + "epoch": 3.049520766773163, + "grad_norm": 0.18026290833950043, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3818 + }, + { + "epoch": 3.0503194888178915, + "grad_norm": 0.07298897206783295, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 3819 + }, + { + "epoch": 3.0511182108626196, + "grad_norm": 0.11078597605228424, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 3820 + }, + { + "epoch": 3.051916932907348, + "grad_norm": 0.13672129809856415, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3821 + }, + { + "epoch": 3.0527156549520766, + "grad_norm": 0.11172370612621307, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3822 + }, + { + "epoch": 3.053514376996805, + "grad_norm": 0.09000302106142044, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3823 + }, + { + "epoch": 3.0543130990415337, + "grad_norm": 0.055291030555963516, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3824 + }, + { + "epoch": 3.055111821086262, + "grad_norm": 0.05691349133849144, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3825 + }, + { + "epoch": 3.0559105431309903, + "grad_norm": 0.0744122862815857, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3826 + }, + { + "epoch": 3.056709265175719, + "grad_norm": 0.06438847631216049, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3827 + }, + { + "epoch": 3.0575079872204474, + "grad_norm": 0.0926717221736908, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3828 + }, + { + "epoch": 3.058306709265176, + "grad_norm": 0.15286727249622345, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3829 + }, + { + "epoch": 3.059105431309904, + "grad_norm": 0.2049989253282547, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3830 + }, + { + "epoch": 3.0599041533546325, + "grad_norm": 0.1832154393196106, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3831 + }, + { + "epoch": 3.060702875399361, + "grad_norm": 0.0953374058008194, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3832 + }, + { + "epoch": 3.0615015974440896, + "grad_norm": 0.063878633081913, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3833 + }, + { + "epoch": 3.062300319488818, + "grad_norm": 0.17062409222126007, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3834 + }, + { + "epoch": 3.063099041533546, + "grad_norm": 0.23467828333377838, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3835 + }, + { + "epoch": 3.0638977635782747, + "grad_norm": 0.19458062946796417, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3836 + }, + { + "epoch": 3.0646964856230032, + "grad_norm": 0.06614453345537186, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3837 + }, + { + "epoch": 3.0654952076677318, + "grad_norm": 0.1250256896018982, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3838 + }, + { + "epoch": 3.06629392971246, + "grad_norm": 0.2399163395166397, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3839 + }, + { + "epoch": 3.0670926517571884, + "grad_norm": 0.22544947266578674, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3840 + }, + { + "epoch": 3.067891373801917, + "grad_norm": 0.0710826963186264, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3841 + }, + { + "epoch": 3.0686900958466454, + "grad_norm": 0.12259501218795776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3842 + }, + { + "epoch": 3.069488817891374, + "grad_norm": 0.1313357651233673, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3843 + }, + { + "epoch": 3.070287539936102, + "grad_norm": 0.05492740869522095, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3844 + }, + { + "epoch": 3.0710862619808306, + "grad_norm": 0.08860959857702255, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3845 + }, + { + "epoch": 3.071884984025559, + "grad_norm": 0.12556305527687073, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3846 + }, + { + "epoch": 3.0726837060702876, + "grad_norm": 0.10780923813581467, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3847 + }, + { + "epoch": 3.073482428115016, + "grad_norm": 0.0587402880191803, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3848 + }, + { + "epoch": 3.0742811501597442, + "grad_norm": 0.06155085563659668, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3849 + }, + { + "epoch": 3.0750798722044728, + "grad_norm": 0.07258733361959457, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3850 + }, + { + "epoch": 3.0758785942492013, + "grad_norm": 0.060939520597457886, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3851 + }, + { + "epoch": 3.07667731629393, + "grad_norm": 0.07125407457351685, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3852 + }, + { + "epoch": 3.0774760383386583, + "grad_norm": 0.15338753163814545, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3853 + }, + { + "epoch": 3.0782747603833864, + "grad_norm": 0.18328991532325745, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3854 + }, + { + "epoch": 3.079073482428115, + "grad_norm": 0.1338629275560379, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3855 + }, + { + "epoch": 3.0798722044728435, + "grad_norm": 0.042017024010419846, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3856 + }, + { + "epoch": 3.080670926517572, + "grad_norm": 0.13696196675300598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3857 + }, + { + "epoch": 3.0814696485623, + "grad_norm": 0.17552919685840607, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3858 + }, + { + "epoch": 3.0822683706070286, + "grad_norm": 0.09906235337257385, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3859 + }, + { + "epoch": 3.083067092651757, + "grad_norm": 0.057398926466703415, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3860 + }, + { + "epoch": 3.0838658146964857, + "grad_norm": 0.12260781973600388, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3861 + }, + { + "epoch": 3.084664536741214, + "grad_norm": 0.12672549486160278, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3862 + }, + { + "epoch": 3.0854632587859423, + "grad_norm": 0.07239031046628952, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3863 + }, + { + "epoch": 3.086261980830671, + "grad_norm": 0.0928259864449501, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3864 + }, + { + "epoch": 3.0870607028753994, + "grad_norm": 0.2161056250333786, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3865 + }, + { + "epoch": 3.087859424920128, + "grad_norm": 0.21302388608455658, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3866 + }, + { + "epoch": 3.0886581469648564, + "grad_norm": 0.10730110853910446, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3867 + }, + { + "epoch": 3.0894568690095845, + "grad_norm": 0.06801975518465042, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3868 + }, + { + "epoch": 3.090255591054313, + "grad_norm": 0.09036632627248764, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3869 + }, + { + "epoch": 3.0910543130990416, + "grad_norm": 0.1344052255153656, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3870 + }, + { + "epoch": 3.09185303514377, + "grad_norm": 0.10774482041597366, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3871 + }, + { + "epoch": 3.0926517571884986, + "grad_norm": 0.06824023276567459, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3872 + }, + { + "epoch": 3.0934504792332267, + "grad_norm": 0.11959507316350937, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3873 + }, + { + "epoch": 3.094249201277955, + "grad_norm": 0.14943768084049225, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3874 + }, + { + "epoch": 3.0950479233226837, + "grad_norm": 0.13593481481075287, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3875 + }, + { + "epoch": 3.0958466453674123, + "grad_norm": 0.06872473657131195, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3876 + }, + { + "epoch": 3.0966453674121404, + "grad_norm": 0.07243353873491287, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3877 + }, + { + "epoch": 3.097444089456869, + "grad_norm": 0.07884293049573898, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3878 + }, + { + "epoch": 3.0982428115015974, + "grad_norm": 0.09574474394321442, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3879 + }, + { + "epoch": 3.099041533546326, + "grad_norm": 0.09028270840644836, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3880 + }, + { + "epoch": 3.0998402555910545, + "grad_norm": 0.056680940091609955, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3881 + }, + { + "epoch": 3.1006389776357826, + "grad_norm": 0.13817615807056427, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3882 + }, + { + "epoch": 3.101437699680511, + "grad_norm": 0.16102705895900726, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3883 + }, + { + "epoch": 3.1022364217252396, + "grad_norm": 0.08887791633605957, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3884 + }, + { + "epoch": 3.103035143769968, + "grad_norm": 0.055100735276937485, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3885 + }, + { + "epoch": 3.1038338658146967, + "grad_norm": 0.10710839927196503, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3886 + }, + { + "epoch": 3.1046325878594248, + "grad_norm": 0.09228713810443878, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3887 + }, + { + "epoch": 3.1054313099041533, + "grad_norm": 0.04602783918380737, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3888 + }, + { + "epoch": 3.106230031948882, + "grad_norm": 0.03584764152765274, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3889 + }, + { + "epoch": 3.1070287539936103, + "grad_norm": 0.04486532881855965, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3890 + }, + { + "epoch": 3.107827476038339, + "grad_norm": 0.036488354206085205, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3891 + }, + { + "epoch": 3.108626198083067, + "grad_norm": 0.04213477671146393, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3892 + }, + { + "epoch": 3.1094249201277955, + "grad_norm": 0.03840509057044983, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 3893 + }, + { + "epoch": 3.110223642172524, + "grad_norm": 0.04800419509410858, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3894 + }, + { + "epoch": 3.1110223642172525, + "grad_norm": 0.06467507034540176, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3895 + }, + { + "epoch": 3.1118210862619806, + "grad_norm": 0.05736416578292847, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3896 + }, + { + "epoch": 3.112619808306709, + "grad_norm": 0.03337489813566208, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3897 + }, + { + "epoch": 3.1134185303514377, + "grad_norm": 0.088229238986969, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3898 + }, + { + "epoch": 3.114217252396166, + "grad_norm": 0.1492392122745514, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3899 + }, + { + "epoch": 3.1150159744408947, + "grad_norm": 0.1699269413948059, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3900 + }, + { + "epoch": 3.115814696485623, + "grad_norm": 0.11532948911190033, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3901 + }, + { + "epoch": 3.1166134185303513, + "grad_norm": 0.030054764822125435, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3902 + }, + { + "epoch": 3.11741214057508, + "grad_norm": 0.11079075932502747, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 3903 + }, + { + "epoch": 3.1182108626198084, + "grad_norm": 0.15733082592487335, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3904 + }, + { + "epoch": 3.119009584664537, + "grad_norm": 0.12520034611225128, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 3905 + }, + { + "epoch": 3.119808306709265, + "grad_norm": 0.03382280096411705, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3906 + }, + { + "epoch": 3.1206070287539935, + "grad_norm": 0.11951576173305511, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3907 + }, + { + "epoch": 3.121405750798722, + "grad_norm": 0.2123839259147644, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3908 + }, + { + "epoch": 3.1222044728434506, + "grad_norm": 0.15437674522399902, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3909 + }, + { + "epoch": 3.123003194888179, + "grad_norm": 0.06463608890771866, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3910 + }, + { + "epoch": 3.123801916932907, + "grad_norm": 0.10830746591091156, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3911 + }, + { + "epoch": 3.1246006389776357, + "grad_norm": 0.17621003091335297, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3912 + }, + { + "epoch": 3.1253993610223643, + "grad_norm": 0.12417379021644592, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3913 + }, + { + "epoch": 3.126198083067093, + "grad_norm": 0.05364898219704628, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3914 + }, + { + "epoch": 3.126996805111821, + "grad_norm": 0.17589502036571503, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3915 + }, + { + "epoch": 3.1277955271565494, + "grad_norm": 0.249656081199646, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3916 + }, + { + "epoch": 3.128594249201278, + "grad_norm": 0.1800973266363144, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3917 + }, + { + "epoch": 3.1293929712460065, + "grad_norm": 0.09763745218515396, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3918 + }, + { + "epoch": 3.130191693290735, + "grad_norm": 0.10953835397958755, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3919 + }, + { + "epoch": 3.130990415335463, + "grad_norm": 0.17490456998348236, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3920 + }, + { + "epoch": 3.1317891373801916, + "grad_norm": 0.11533153057098389, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3921 + }, + { + "epoch": 3.13258785942492, + "grad_norm": 0.07494231313467026, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3922 + }, + { + "epoch": 3.1333865814696487, + "grad_norm": 0.14954763650894165, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3923 + }, + { + "epoch": 3.134185303514377, + "grad_norm": 0.18061646819114685, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3924 + }, + { + "epoch": 3.1349840255591053, + "grad_norm": 0.10419650375843048, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3925 + }, + { + "epoch": 3.135782747603834, + "grad_norm": 0.04677566513419151, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3926 + }, + { + "epoch": 3.1365814696485623, + "grad_norm": 0.12846903502941132, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3927 + }, + { + "epoch": 3.137380191693291, + "grad_norm": 0.11824795603752136, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3928 + }, + { + "epoch": 3.1381789137380194, + "grad_norm": 0.04194530099630356, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3929 + }, + { + "epoch": 3.1389776357827475, + "grad_norm": 0.15154412388801575, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3930 + }, + { + "epoch": 3.139776357827476, + "grad_norm": 0.19073615968227386, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3931 + }, + { + "epoch": 3.1405750798722045, + "grad_norm": 0.12614648044109344, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3932 + }, + { + "epoch": 3.141373801916933, + "grad_norm": 0.03434520214796066, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3933 + }, + { + "epoch": 3.142172523961661, + "grad_norm": 0.11913489550352097, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3934 + }, + { + "epoch": 3.1429712460063897, + "grad_norm": 0.16297172009944916, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3935 + }, + { + "epoch": 3.143769968051118, + "grad_norm": 0.15605789422988892, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3936 + }, + { + "epoch": 3.1445686900958467, + "grad_norm": 0.10524406284093857, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3937 + }, + { + "epoch": 3.1453674121405752, + "grad_norm": 0.03763152286410332, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3938 + }, + { + "epoch": 3.1461661341853033, + "grad_norm": 0.07586465775966644, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3939 + }, + { + "epoch": 3.146964856230032, + "grad_norm": 0.14553581178188324, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3940 + }, + { + "epoch": 3.1477635782747604, + "grad_norm": 0.1883595883846283, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3941 + }, + { + "epoch": 3.148562300319489, + "grad_norm": 0.13018599152565002, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3942 + }, + { + "epoch": 3.1493610223642174, + "grad_norm": 0.05356704071164131, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3943 + }, + { + "epoch": 3.1501597444089455, + "grad_norm": 0.2083088606595993, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3944 + }, + { + "epoch": 3.150958466453674, + "grad_norm": 0.2586681544780731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3945 + }, + { + "epoch": 3.1517571884984026, + "grad_norm": 0.18733063340187073, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3946 + }, + { + "epoch": 3.152555910543131, + "grad_norm": 0.03741752356290817, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3947 + }, + { + "epoch": 3.1533546325878596, + "grad_norm": 0.11660216003656387, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3948 + }, + { + "epoch": 3.1541533546325877, + "grad_norm": 0.12698383629322052, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3949 + }, + { + "epoch": 3.1549520766773163, + "grad_norm": 0.10244922339916229, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3950 + }, + { + "epoch": 3.155750798722045, + "grad_norm": 0.03815237060189247, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3951 + }, + { + "epoch": 3.1565495207667733, + "grad_norm": 0.04394761845469475, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3952 + }, + { + "epoch": 3.1573482428115014, + "grad_norm": 0.1344541311264038, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3953 + }, + { + "epoch": 3.15814696485623, + "grad_norm": 0.23006947338581085, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3954 + }, + { + "epoch": 3.1589456869009584, + "grad_norm": 0.2667021155357361, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3955 + }, + { + "epoch": 3.159744408945687, + "grad_norm": 0.2410362809896469, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3956 + }, + { + "epoch": 3.1605431309904155, + "grad_norm": 0.1421661078929901, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3957 + }, + { + "epoch": 3.1613418530351436, + "grad_norm": 0.04178561642765999, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3958 + }, + { + "epoch": 3.162140575079872, + "grad_norm": 0.15327088534832, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3959 + }, + { + "epoch": 3.1629392971246006, + "grad_norm": 0.1372532993555069, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3960 + }, + { + "epoch": 3.163738019169329, + "grad_norm": 0.03763817250728607, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3961 + }, + { + "epoch": 3.1645367412140577, + "grad_norm": 0.13227587938308716, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3962 + }, + { + "epoch": 3.165335463258786, + "grad_norm": 0.1952073723077774, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3963 + }, + { + "epoch": 3.1661341853035143, + "grad_norm": 0.1672048568725586, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3964 + }, + { + "epoch": 3.166932907348243, + "grad_norm": 0.09593698382377625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3965 + }, + { + "epoch": 3.1677316293929714, + "grad_norm": 0.03619454428553581, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3966 + }, + { + "epoch": 3.1685303514377, + "grad_norm": 0.05974683538079262, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3967 + }, + { + "epoch": 3.169329073482428, + "grad_norm": 0.09733424335718155, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3968 + }, + { + "epoch": 3.1701277955271565, + "grad_norm": 0.07536087185144424, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3969 + }, + { + "epoch": 3.170926517571885, + "grad_norm": 0.04263869300484657, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3970 + }, + { + "epoch": 3.1717252396166136, + "grad_norm": 0.040521468967199326, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3971 + }, + { + "epoch": 3.1725239616613417, + "grad_norm": 0.05615096539258957, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3972 + }, + { + "epoch": 3.17332268370607, + "grad_norm": 0.06655194610357285, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3973 + }, + { + "epoch": 3.1741214057507987, + "grad_norm": 0.07300302386283875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3974 + }, + { + "epoch": 3.1749201277955272, + "grad_norm": 0.04789174720644951, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3975 + }, + { + "epoch": 3.1757188498402558, + "grad_norm": 0.03460157290101051, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3976 + }, + { + "epoch": 3.176517571884984, + "grad_norm": 0.0393557995557785, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3977 + }, + { + "epoch": 3.1773162939297124, + "grad_norm": 0.062453389167785645, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3978 + }, + { + "epoch": 3.178115015974441, + "grad_norm": 0.08542043715715408, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3979 + }, + { + "epoch": 3.1789137380191694, + "grad_norm": 0.08002828061580658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3980 + }, + { + "epoch": 3.179712460063898, + "grad_norm": 0.04635196551680565, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3981 + }, + { + "epoch": 3.180511182108626, + "grad_norm": 0.09583642333745956, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3982 + }, + { + "epoch": 3.1813099041533546, + "grad_norm": 0.12418454885482788, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3983 + }, + { + "epoch": 3.182108626198083, + "grad_norm": 0.10457618534564972, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3984 + }, + { + "epoch": 3.1829073482428116, + "grad_norm": 0.07183804363012314, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3985 + }, + { + "epoch": 3.18370607028754, + "grad_norm": 0.039956409484148026, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3986 + }, + { + "epoch": 3.1845047923322682, + "grad_norm": 0.0884016826748848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3987 + }, + { + "epoch": 3.1853035143769968, + "grad_norm": 0.112494558095932, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3988 + }, + { + "epoch": 3.1861022364217253, + "grad_norm": 0.07582054287195206, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3989 + }, + { + "epoch": 3.186900958466454, + "grad_norm": 0.060303278267383575, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3990 + }, + { + "epoch": 3.187699680511182, + "grad_norm": 0.048326775431632996, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3991 + }, + { + "epoch": 3.1884984025559104, + "grad_norm": 0.32322436571121216, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3992 + }, + { + "epoch": 3.189297124600639, + "grad_norm": 0.5569815039634705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3993 + }, + { + "epoch": 3.1900958466453675, + "grad_norm": 0.7590563893318176, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 3994 + }, + { + "epoch": 3.190894568690096, + "grad_norm": 0.6537879705429077, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3995 + }, + { + "epoch": 3.191693290734824, + "grad_norm": 0.16556645929813385, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3996 + }, + { + "epoch": 3.1924920127795526, + "grad_norm": 0.3745940625667572, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3997 + }, + { + "epoch": 3.193290734824281, + "grad_norm": 0.5159009695053101, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 3998 + }, + { + "epoch": 3.1940894568690097, + "grad_norm": 0.1302756816148758, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3999 + }, + { + "epoch": 3.194888178913738, + "grad_norm": 0.3484213054180145, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4000 + }, + { + "epoch": 3.1956869009584663, + "grad_norm": 0.23763029277324677, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4001 + }, + { + "epoch": 3.196485623003195, + "grad_norm": 0.20648746192455292, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4002 + }, + { + "epoch": 3.1972843450479234, + "grad_norm": 0.31230399012565613, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4003 + }, + { + "epoch": 3.198083067092652, + "grad_norm": 0.15389247238636017, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4004 + }, + { + "epoch": 3.1988817891373804, + "grad_norm": 0.6544334292411804, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4005 + }, + { + "epoch": 3.1996805111821085, + "grad_norm": 0.5409669280052185, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4006 + }, + { + "epoch": 3.200479233226837, + "grad_norm": 0.11126074194908142, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4007 + }, + { + "epoch": 3.2012779552715656, + "grad_norm": 0.3257724642753601, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4008 + }, + { + "epoch": 3.202076677316294, + "grad_norm": 0.4188903272151947, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4009 + }, + { + "epoch": 3.202875399361022, + "grad_norm": 0.1012830138206482, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4010 + }, + { + "epoch": 3.2036741214057507, + "grad_norm": 0.2771216034889221, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4011 + }, + { + "epoch": 3.2044728434504792, + "grad_norm": 0.2873278260231018, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4012 + }, + { + "epoch": 3.2052715654952078, + "grad_norm": 0.09620041400194168, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4013 + }, + { + "epoch": 3.2060702875399363, + "grad_norm": 0.10561787337064743, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4014 + }, + { + "epoch": 3.2068690095846644, + "grad_norm": 0.12499046325683594, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4015 + }, + { + "epoch": 3.207667731629393, + "grad_norm": 0.4055064916610718, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4016 + }, + { + "epoch": 3.2084664536741214, + "grad_norm": 0.9722099900245667, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 4017 + }, + { + "epoch": 3.20926517571885, + "grad_norm": 0.7367122173309326, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 4018 + }, + { + "epoch": 3.2100638977635785, + "grad_norm": 0.4455755650997162, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4019 + }, + { + "epoch": 3.2108626198083066, + "grad_norm": 0.10350961983203888, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4020 + }, + { + "epoch": 3.211661341853035, + "grad_norm": 0.41901662945747375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4021 + }, + { + "epoch": 3.2124600638977636, + "grad_norm": 0.5987749695777893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4022 + }, + { + "epoch": 3.213258785942492, + "grad_norm": 1.5967272520065308, + "learning_rate": 0.0005, + "loss": 1.1938, + "step": 4023 + }, + { + "epoch": 3.2140575079872207, + "grad_norm": 3.289113759994507, + "learning_rate": 0.0005, + "loss": 1.2474, + "step": 4024 + }, + { + "epoch": 3.2148562300319488, + "grad_norm": 0.40220701694488525, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 4025 + }, + { + "epoch": 3.2156549520766773, + "grad_norm": 0.15129008889198303, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4026 + }, + { + "epoch": 3.216453674121406, + "grad_norm": 19.060272216796875, + "learning_rate": 0.0005, + "loss": 1.4668, + "step": 4027 + }, + { + "epoch": 3.2172523961661343, + "grad_norm": 1.72987961769104, + "learning_rate": 0.0005, + "loss": 1.3675, + "step": 4028 + }, + { + "epoch": 3.2180511182108624, + "grad_norm": 2.1064836978912354, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 4029 + }, + { + "epoch": 3.218849840255591, + "grad_norm": 1.0206952095031738, + "learning_rate": 0.0005, + "loss": 1.2602, + "step": 4030 + }, + { + "epoch": 3.2196485623003195, + "grad_norm": 14.109564781188965, + "learning_rate": 0.0005, + "loss": 3.9831, + "step": 4031 + }, + { + "epoch": 3.220447284345048, + "grad_norm": 12.518637657165527, + "learning_rate": 0.0005, + "loss": 3.4388, + "step": 4032 + }, + { + "epoch": 3.2212460063897765, + "grad_norm": 4.156238079071045, + "learning_rate": 0.0005, + "loss": 2.1713, + "step": 4033 + }, + { + "epoch": 3.2220447284345046, + "grad_norm": 2.752128839492798, + "learning_rate": 0.0005, + "loss": 1.6581, + "step": 4034 + }, + { + "epoch": 3.222843450479233, + "grad_norm": 5.876696586608887, + "learning_rate": 0.0005, + "loss": 2.1698, + "step": 4035 + }, + { + "epoch": 3.2236421725239617, + "grad_norm": 7.60305118560791, + "learning_rate": 0.0005, + "loss": 3.0713, + "step": 4036 + }, + { + "epoch": 3.22444089456869, + "grad_norm": 2.581448554992676, + "learning_rate": 0.0005, + "loss": 1.7677, + "step": 4037 + }, + { + "epoch": 3.2252396166134187, + "grad_norm": 1.0544116497039795, + "learning_rate": 0.0005, + "loss": 1.4604, + "step": 4038 + }, + { + "epoch": 3.226038338658147, + "grad_norm": 10.742961883544922, + "learning_rate": 0.0005, + "loss": 3.8634, + "step": 4039 + }, + { + "epoch": 3.2268370607028753, + "grad_norm": 6.555435657501221, + "learning_rate": 0.0005, + "loss": 2.7229, + "step": 4040 + }, + { + "epoch": 3.227635782747604, + "grad_norm": 4.335379600524902, + "learning_rate": 0.0005, + "loss": 2.548, + "step": 4041 + }, + { + "epoch": 3.2284345047923324, + "grad_norm": 3.9863200187683105, + "learning_rate": 0.0005, + "loss": 2.5051, + "step": 4042 + }, + { + "epoch": 3.229233226837061, + "grad_norm": 3.4922895431518555, + "learning_rate": 0.0005, + "loss": 2.1996, + "step": 4043 + }, + { + "epoch": 3.230031948881789, + "grad_norm": 0.9404768347740173, + "learning_rate": 0.0005, + "loss": 1.7869, + "step": 4044 + }, + { + "epoch": 3.2308306709265175, + "grad_norm": 1.2953938245773315, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 4045 + }, + { + "epoch": 3.231629392971246, + "grad_norm": 2.0215165615081787, + "learning_rate": 0.0005, + "loss": 1.9429, + "step": 4046 + }, + { + "epoch": 3.2324281150159746, + "grad_norm": 1.2744032144546509, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 4047 + }, + { + "epoch": 3.2332268370607027, + "grad_norm": 2.042656660079956, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 4048 + }, + { + "epoch": 3.234025559105431, + "grad_norm": 6.607172012329102, + "learning_rate": 0.0005, + "loss": 2.8381, + "step": 4049 + }, + { + "epoch": 3.2348242811501597, + "grad_norm": 1.2499932050704956, + "learning_rate": 0.0005, + "loss": 1.6324, + "step": 4050 + }, + { + "epoch": 3.2356230031948883, + "grad_norm": 1.1896424293518066, + "learning_rate": 0.0005, + "loss": 1.7201, + "step": 4051 + }, + { + "epoch": 3.236421725239617, + "grad_norm": 1.9901418685913086, + "learning_rate": 0.0005, + "loss": 1.7335, + "step": 4052 + }, + { + "epoch": 3.237220447284345, + "grad_norm": 0.8886330127716064, + "learning_rate": 0.0005, + "loss": 1.5111, + "step": 4053 + }, + { + "epoch": 3.2380191693290734, + "grad_norm": 2.6570353507995605, + "learning_rate": 0.0005, + "loss": 1.8628, + "step": 4054 + }, + { + "epoch": 3.238817891373802, + "grad_norm": 2.212905168533325, + "learning_rate": 0.0005, + "loss": 1.5838, + "step": 4055 + }, + { + "epoch": 3.2396166134185305, + "grad_norm": 3.1234660148620605, + "learning_rate": 0.0005, + "loss": 1.7212, + "step": 4056 + }, + { + "epoch": 3.2404153354632586, + "grad_norm": 0.9168338775634766, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 4057 + }, + { + "epoch": 3.241214057507987, + "grad_norm": 0.8366042971611023, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 4058 + }, + { + "epoch": 3.2420127795527156, + "grad_norm": 0.5359059572219849, + "learning_rate": 0.0005, + "loss": 1.4185, + "step": 4059 + }, + { + "epoch": 3.242811501597444, + "grad_norm": 1.8511804342269897, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 4060 + }, + { + "epoch": 3.2436102236421727, + "grad_norm": 1.3229485750198364, + "learning_rate": 0.0005, + "loss": 1.4497, + "step": 4061 + }, + { + "epoch": 3.244408945686901, + "grad_norm": 0.8846393823623657, + "learning_rate": 0.0005, + "loss": 1.384, + "step": 4062 + }, + { + "epoch": 3.2452076677316293, + "grad_norm": 1.1345176696777344, + "learning_rate": 0.0005, + "loss": 1.3906, + "step": 4063 + }, + { + "epoch": 3.246006389776358, + "grad_norm": 0.998261034488678, + "learning_rate": 0.0005, + "loss": 1.3807, + "step": 4064 + }, + { + "epoch": 3.2468051118210863, + "grad_norm": 0.8998358249664307, + "learning_rate": 0.0005, + "loss": 1.3321, + "step": 4065 + }, + { + "epoch": 3.247603833865815, + "grad_norm": 0.6892838478088379, + "learning_rate": 0.0005, + "loss": 1.3718, + "step": 4066 + }, + { + "epoch": 3.248402555910543, + "grad_norm": 0.515389084815979, + "learning_rate": 0.0005, + "loss": 1.3296, + "step": 4067 + }, + { + "epoch": 3.2492012779552715, + "grad_norm": 0.41038376092910767, + "learning_rate": 0.0005, + "loss": 1.2855, + "step": 4068 + }, + { + "epoch": 3.25, + "grad_norm": 0.6094494462013245, + "learning_rate": 0.0005, + "loss": 1.2953, + "step": 4069 + }, + { + "epoch": 3.2507987220447285, + "grad_norm": 0.6274027228355408, + "learning_rate": 0.0005, + "loss": 1.2879, + "step": 4070 + }, + { + "epoch": 3.251597444089457, + "grad_norm": 0.8833006024360657, + "learning_rate": 0.0005, + "loss": 1.2806, + "step": 4071 + }, + { + "epoch": 3.252396166134185, + "grad_norm": 0.8688742518424988, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 4072 + }, + { + "epoch": 3.2531948881789137, + "grad_norm": 0.34751075506210327, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 4073 + }, + { + "epoch": 3.253993610223642, + "grad_norm": 0.4245823621749878, + "learning_rate": 0.0005, + "loss": 1.2529, + "step": 4074 + }, + { + "epoch": 3.2547923322683707, + "grad_norm": 0.4495961368083954, + "learning_rate": 0.0005, + "loss": 1.2438, + "step": 4075 + }, + { + "epoch": 3.255591054313099, + "grad_norm": 0.683125913143158, + "learning_rate": 0.0005, + "loss": 1.2297, + "step": 4076 + }, + { + "epoch": 3.2563897763578273, + "grad_norm": 0.4342438876628876, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 4077 + }, + { + "epoch": 3.257188498402556, + "grad_norm": 0.2018793523311615, + "learning_rate": 0.0005, + "loss": 1.2313, + "step": 4078 + }, + { + "epoch": 3.2579872204472844, + "grad_norm": 0.26145434379577637, + "learning_rate": 0.0005, + "loss": 1.218, + "step": 4079 + }, + { + "epoch": 3.258785942492013, + "grad_norm": 0.16941657662391663, + "learning_rate": 0.0005, + "loss": 1.2211, + "step": 4080 + }, + { + "epoch": 3.2595846645367414, + "grad_norm": 0.3158339262008667, + "learning_rate": 0.0005, + "loss": 1.2192, + "step": 4081 + }, + { + "epoch": 3.2603833865814695, + "grad_norm": 0.18630816042423248, + "learning_rate": 0.0005, + "loss": 1.2091, + "step": 4082 + }, + { + "epoch": 3.261182108626198, + "grad_norm": 0.19504855573177338, + "learning_rate": 0.0005, + "loss": 1.2047, + "step": 4083 + }, + { + "epoch": 3.2619808306709266, + "grad_norm": 0.19672146439552307, + "learning_rate": 0.0005, + "loss": 1.2022, + "step": 4084 + }, + { + "epoch": 3.262779552715655, + "grad_norm": 0.15959087014198303, + "learning_rate": 0.0005, + "loss": 1.1957, + "step": 4085 + }, + { + "epoch": 3.263578274760383, + "grad_norm": 0.18326745927333832, + "learning_rate": 0.0005, + "loss": 1.1835, + "step": 4086 + }, + { + "epoch": 3.2643769968051117, + "grad_norm": 0.23495830595493317, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 4087 + }, + { + "epoch": 3.2651757188498403, + "grad_norm": 0.22718247771263123, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 4088 + }, + { + "epoch": 3.265974440894569, + "grad_norm": 0.2913427948951721, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 4089 + }, + { + "epoch": 3.2667731629392973, + "grad_norm": 0.44531312584877014, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 4090 + }, + { + "epoch": 3.2675718849840254, + "grad_norm": 0.6265004277229309, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 4091 + }, + { + "epoch": 3.268370607028754, + "grad_norm": 0.6119574904441833, + "learning_rate": 0.0005, + "loss": 1.186, + "step": 4092 + }, + { + "epoch": 3.2691693290734825, + "grad_norm": 0.23989497125148773, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 4093 + }, + { + "epoch": 3.269968051118211, + "grad_norm": 0.266013503074646, + "learning_rate": 0.0005, + "loss": 1.1693, + "step": 4094 + }, + { + "epoch": 3.270766773162939, + "grad_norm": 0.2205667793750763, + "learning_rate": 0.0005, + "loss": 1.1627, + "step": 4095 + }, + { + "epoch": 3.2715654952076676, + "grad_norm": 0.4600715935230255, + "learning_rate": 0.0005, + "loss": 1.1566, + "step": 4096 + }, + { + "epoch": 3.272364217252396, + "grad_norm": 0.6725661754608154, + "learning_rate": 0.0005, + "loss": 1.1806, + "step": 4097 + }, + { + "epoch": 3.2731629392971247, + "grad_norm": 0.3836606442928314, + "learning_rate": 0.0005, + "loss": 1.1613, + "step": 4098 + }, + { + "epoch": 3.273961661341853, + "grad_norm": 0.3752588629722595, + "learning_rate": 0.0005, + "loss": 1.1639, + "step": 4099 + }, + { + "epoch": 3.2747603833865817, + "grad_norm": 0.3297381103038788, + "learning_rate": 0.0005, + "loss": 1.1488, + "step": 4100 + }, + { + "epoch": 3.27555910543131, + "grad_norm": 0.5899438858032227, + "learning_rate": 0.0005, + "loss": 1.1486, + "step": 4101 + }, + { + "epoch": 3.2763578274760383, + "grad_norm": 0.5899466872215271, + "learning_rate": 0.0005, + "loss": 1.1533, + "step": 4102 + }, + { + "epoch": 3.277156549520767, + "grad_norm": 0.2944958209991455, + "learning_rate": 0.0005, + "loss": 1.1517, + "step": 4103 + }, + { + "epoch": 3.2779552715654954, + "grad_norm": 0.5870373249053955, + "learning_rate": 0.0005, + "loss": 1.1484, + "step": 4104 + }, + { + "epoch": 3.2787539936102235, + "grad_norm": 0.25267326831817627, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 4105 + }, + { + "epoch": 3.279552715654952, + "grad_norm": 0.20602582395076752, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 4106 + }, + { + "epoch": 3.2803514376996805, + "grad_norm": 0.4151447117328644, + "learning_rate": 0.0005, + "loss": 1.1338, + "step": 4107 + }, + { + "epoch": 3.281150159744409, + "grad_norm": 0.6591519117355347, + "learning_rate": 0.0005, + "loss": 1.1395, + "step": 4108 + }, + { + "epoch": 3.2819488817891376, + "grad_norm": 0.48510807752609253, + "learning_rate": 0.0005, + "loss": 1.1496, + "step": 4109 + }, + { + "epoch": 3.2827476038338657, + "grad_norm": 0.27803128957748413, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 4110 + }, + { + "epoch": 3.283546325878594, + "grad_norm": 0.3939184546470642, + "learning_rate": 0.0005, + "loss": 1.141, + "step": 4111 + }, + { + "epoch": 3.2843450479233227, + "grad_norm": 0.18271984159946442, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 4112 + }, + { + "epoch": 3.2851437699680512, + "grad_norm": 0.19690747559070587, + "learning_rate": 0.0005, + "loss": 1.1286, + "step": 4113 + }, + { + "epoch": 3.2859424920127793, + "grad_norm": 0.22968755662441254, + "learning_rate": 0.0005, + "loss": 1.1316, + "step": 4114 + }, + { + "epoch": 3.286741214057508, + "grad_norm": 0.24908174574375153, + "learning_rate": 0.0005, + "loss": 1.1279, + "step": 4115 + }, + { + "epoch": 3.2875399361022364, + "grad_norm": 0.15813285112380981, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 4116 + }, + { + "epoch": 3.288338658146965, + "grad_norm": 0.1056000292301178, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 4117 + }, + { + "epoch": 3.2891373801916934, + "grad_norm": 0.19983351230621338, + "learning_rate": 0.0005, + "loss": 1.118, + "step": 4118 + }, + { + "epoch": 3.289936102236422, + "grad_norm": 0.13660027086734772, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 4119 + }, + { + "epoch": 3.29073482428115, + "grad_norm": 0.15008457005023956, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 4120 + }, + { + "epoch": 3.2915335463258786, + "grad_norm": 0.1475287824869156, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 4121 + }, + { + "epoch": 3.292332268370607, + "grad_norm": 0.10478811711072922, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 4122 + }, + { + "epoch": 3.2931309904153356, + "grad_norm": 0.1577034890651703, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 4123 + }, + { + "epoch": 3.2939297124600637, + "grad_norm": 0.1019970178604126, + "learning_rate": 0.0005, + "loss": 1.1117, + "step": 4124 + }, + { + "epoch": 3.2947284345047922, + "grad_norm": 0.09229713678359985, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 4125 + }, + { + "epoch": 3.2955271565495208, + "grad_norm": 0.10029986500740051, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 4126 + }, + { + "epoch": 3.2963258785942493, + "grad_norm": 0.14171569049358368, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4127 + }, + { + "epoch": 3.297124600638978, + "grad_norm": 0.17343609035015106, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 4128 + }, + { + "epoch": 3.297923322683706, + "grad_norm": 0.2738705277442932, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 4129 + }, + { + "epoch": 3.2987220447284344, + "grad_norm": 0.3518083691596985, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 4130 + }, + { + "epoch": 3.299520766773163, + "grad_norm": 0.16174353659152985, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 4131 + }, + { + "epoch": 3.3003194888178915, + "grad_norm": 0.24402645230293274, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4132 + }, + { + "epoch": 3.3011182108626196, + "grad_norm": 0.23362669348716736, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 4133 + }, + { + "epoch": 3.301916932907348, + "grad_norm": 0.1391523778438568, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 4134 + }, + { + "epoch": 3.3027156549520766, + "grad_norm": 0.1516295224428177, + "learning_rate": 0.0005, + "loss": 1.0968, + "step": 4135 + }, + { + "epoch": 3.303514376996805, + "grad_norm": 0.17463526129722595, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 4136 + }, + { + "epoch": 3.3043130990415337, + "grad_norm": 0.13717398047447205, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 4137 + }, + { + "epoch": 3.3051118210862622, + "grad_norm": 0.16802728176116943, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 4138 + }, + { + "epoch": 3.3059105431309903, + "grad_norm": 0.11959057301282883, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 4139 + }, + { + "epoch": 3.306709265175719, + "grad_norm": 0.07706355303525925, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4140 + }, + { + "epoch": 3.3075079872204474, + "grad_norm": 0.07729125767946243, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 4141 + }, + { + "epoch": 3.308306709265176, + "grad_norm": 0.08654871582984924, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4142 + }, + { + "epoch": 3.309105431309904, + "grad_norm": 0.11485479772090912, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 4143 + }, + { + "epoch": 3.3099041533546325, + "grad_norm": 0.10812658816576004, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4144 + }, + { + "epoch": 3.310702875399361, + "grad_norm": 0.08537860214710236, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4145 + }, + { + "epoch": 3.3115015974440896, + "grad_norm": 0.10628878325223923, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 4146 + }, + { + "epoch": 3.312300319488818, + "grad_norm": 0.14903275668621063, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 4147 + }, + { + "epoch": 3.313099041533546, + "grad_norm": 0.09670894593000412, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4148 + }, + { + "epoch": 3.3138977635782747, + "grad_norm": 0.10959025472402573, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 4149 + }, + { + "epoch": 3.3146964856230032, + "grad_norm": 0.10397703945636749, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 4150 + }, + { + "epoch": 3.3154952076677318, + "grad_norm": 0.07681623846292496, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 4151 + }, + { + "epoch": 3.31629392971246, + "grad_norm": 0.07938152551651001, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 4152 + }, + { + "epoch": 3.3170926517571884, + "grad_norm": 0.14678052067756653, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 4153 + }, + { + "epoch": 3.317891373801917, + "grad_norm": 0.15366105735301971, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4154 + }, + { + "epoch": 3.3186900958466454, + "grad_norm": 0.13449597358703613, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 4155 + }, + { + "epoch": 3.319488817891374, + "grad_norm": 0.0861068144440651, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 4156 + }, + { + "epoch": 3.3202875399361025, + "grad_norm": 0.0604286752641201, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 4157 + }, + { + "epoch": 3.3210862619808306, + "grad_norm": 0.08299542963504791, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4158 + }, + { + "epoch": 3.321884984025559, + "grad_norm": 0.0738200917840004, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 4159 + }, + { + "epoch": 3.3226837060702876, + "grad_norm": 0.06450676172971725, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4160 + }, + { + "epoch": 3.323482428115016, + "grad_norm": 0.055281370878219604, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4161 + }, + { + "epoch": 3.3242811501597442, + "grad_norm": 0.09895910322666168, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4162 + }, + { + "epoch": 3.3250798722044728, + "grad_norm": 0.10338333994150162, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 4163 + }, + { + "epoch": 3.3258785942492013, + "grad_norm": 0.08346354216337204, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 4164 + }, + { + "epoch": 3.32667731629393, + "grad_norm": 0.15257857739925385, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 4165 + }, + { + "epoch": 3.3274760383386583, + "grad_norm": 0.1782383918762207, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4166 + }, + { + "epoch": 3.3282747603833864, + "grad_norm": 0.09908363968133926, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4167 + }, + { + "epoch": 3.329073482428115, + "grad_norm": 0.18942143023014069, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 4168 + }, + { + "epoch": 3.3298722044728435, + "grad_norm": 0.21095149219036102, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4169 + }, + { + "epoch": 3.330670926517572, + "grad_norm": 0.11597894132137299, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4170 + }, + { + "epoch": 3.3314696485623, + "grad_norm": 0.20450811088085175, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 4171 + }, + { + "epoch": 3.3322683706070286, + "grad_norm": 0.1609300971031189, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4172 + }, + { + "epoch": 3.333067092651757, + "grad_norm": 0.14068877696990967, + "learning_rate": 0.0005, + "loss": 1.0835, + "step": 4173 + }, + { + "epoch": 3.3338658146964857, + "grad_norm": 0.11969266831874847, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 4174 + }, + { + "epoch": 3.334664536741214, + "grad_norm": 0.16986626386642456, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4175 + }, + { + "epoch": 3.3354632587859427, + "grad_norm": 0.2065591812133789, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4176 + }, + { + "epoch": 3.336261980830671, + "grad_norm": 0.23542748391628265, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 4177 + }, + { + "epoch": 3.3370607028753994, + "grad_norm": 0.20896919071674347, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 4178 + }, + { + "epoch": 3.337859424920128, + "grad_norm": 0.16446076333522797, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 4179 + }, + { + "epoch": 3.3386581469648564, + "grad_norm": 0.11143177002668381, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 4180 + }, + { + "epoch": 3.3394568690095845, + "grad_norm": 0.0866970345377922, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 4181 + }, + { + "epoch": 3.340255591054313, + "grad_norm": 0.14608244597911835, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 4182 + }, + { + "epoch": 3.3410543130990416, + "grad_norm": 0.06152384728193283, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4183 + }, + { + "epoch": 3.34185303514377, + "grad_norm": 0.14289656281471252, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4184 + }, + { + "epoch": 3.3426517571884986, + "grad_norm": 0.16735558211803436, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4185 + }, + { + "epoch": 3.3434504792332267, + "grad_norm": 0.09012678265571594, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 4186 + }, + { + "epoch": 3.344249201277955, + "grad_norm": 0.05861378088593483, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4187 + }, + { + "epoch": 3.3450479233226837, + "grad_norm": 0.07123090326786041, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4188 + }, + { + "epoch": 3.3458466453674123, + "grad_norm": 0.07879375666379929, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4189 + }, + { + "epoch": 3.3466453674121404, + "grad_norm": 0.0925324484705925, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 4190 + }, + { + "epoch": 3.347444089456869, + "grad_norm": 0.0686444416642189, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4191 + }, + { + "epoch": 3.3482428115015974, + "grad_norm": 0.08633724600076675, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4192 + }, + { + "epoch": 3.349041533546326, + "grad_norm": 0.056881021708250046, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4193 + }, + { + "epoch": 3.3498402555910545, + "grad_norm": 0.07752947509288788, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4194 + }, + { + "epoch": 3.3506389776357826, + "grad_norm": 0.0927717313170433, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 4195 + }, + { + "epoch": 3.351437699680511, + "grad_norm": 0.09599179029464722, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 4196 + }, + { + "epoch": 3.3522364217252396, + "grad_norm": 0.09090889245271683, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 4197 + }, + { + "epoch": 3.353035143769968, + "grad_norm": 0.12757429480552673, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 4198 + }, + { + "epoch": 3.3538338658146967, + "grad_norm": 0.15210460126399994, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4199 + }, + { + "epoch": 3.3546325878594248, + "grad_norm": 0.10982836782932281, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4200 + }, + { + "epoch": 3.3554313099041533, + "grad_norm": 0.056641776114702225, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4201 + }, + { + "epoch": 3.356230031948882, + "grad_norm": 0.09506776928901672, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4202 + }, + { + "epoch": 3.3570287539936103, + "grad_norm": 0.12064918130636215, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4203 + }, + { + "epoch": 3.357827476038339, + "grad_norm": 0.12343298643827438, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 4204 + }, + { + "epoch": 3.358626198083067, + "grad_norm": 0.11508476734161377, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4205 + }, + { + "epoch": 3.3594249201277955, + "grad_norm": 0.07552453875541687, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4206 + }, + { + "epoch": 3.360223642172524, + "grad_norm": 0.10495936870574951, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 4207 + }, + { + "epoch": 3.3610223642172525, + "grad_norm": 0.13230633735656738, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4208 + }, + { + "epoch": 3.3618210862619806, + "grad_norm": 0.13003787398338318, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4209 + }, + { + "epoch": 3.362619808306709, + "grad_norm": 0.09252234548330307, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 4210 + }, + { + "epoch": 3.3634185303514377, + "grad_norm": 0.07739317417144775, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 4211 + }, + { + "epoch": 3.364217252396166, + "grad_norm": 0.12185318768024445, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4212 + }, + { + "epoch": 3.3650159744408947, + "grad_norm": 0.17643119394779205, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4213 + }, + { + "epoch": 3.365814696485623, + "grad_norm": 0.10462872684001923, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4214 + }, + { + "epoch": 3.3666134185303513, + "grad_norm": 0.1486569344997406, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 4215 + }, + { + "epoch": 3.36741214057508, + "grad_norm": 0.11858930438756943, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4216 + }, + { + "epoch": 3.3682108626198084, + "grad_norm": 0.07907772809267044, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4217 + }, + { + "epoch": 3.369009584664537, + "grad_norm": 0.5416387319564819, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 4218 + }, + { + "epoch": 3.369808306709265, + "grad_norm": 0.08767322450876236, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4219 + }, + { + "epoch": 3.3706070287539935, + "grad_norm": 0.09651107341051102, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4220 + }, + { + "epoch": 3.371405750798722, + "grad_norm": 0.07548791915178299, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4221 + }, + { + "epoch": 3.3722044728434506, + "grad_norm": 0.09317605197429657, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4222 + }, + { + "epoch": 3.373003194888179, + "grad_norm": 0.07431582361459732, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4223 + }, + { + "epoch": 3.373801916932907, + "grad_norm": 0.12754018604755402, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4224 + }, + { + "epoch": 3.3746006389776357, + "grad_norm": 0.12697845697402954, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4225 + }, + { + "epoch": 3.3753993610223643, + "grad_norm": 0.21522995829582214, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4226 + }, + { + "epoch": 3.376198083067093, + "grad_norm": 0.08886270225048065, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 4227 + }, + { + "epoch": 3.376996805111821, + "grad_norm": 0.07107655704021454, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4228 + }, + { + "epoch": 3.3777955271565494, + "grad_norm": 0.07452798634767532, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 4229 + }, + { + "epoch": 3.378594249201278, + "grad_norm": 0.10205573588609695, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4230 + }, + { + "epoch": 3.3793929712460065, + "grad_norm": 0.10990341752767563, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4231 + }, + { + "epoch": 3.380191693290735, + "grad_norm": 0.08567643165588379, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4232 + }, + { + "epoch": 3.380990415335463, + "grad_norm": 0.057073548436164856, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4233 + }, + { + "epoch": 3.3817891373801916, + "grad_norm": 0.12602978944778442, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4234 + }, + { + "epoch": 3.38258785942492, + "grad_norm": 0.1715400218963623, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4235 + }, + { + "epoch": 3.3833865814696487, + "grad_norm": 0.13129903376102448, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 4236 + }, + { + "epoch": 3.384185303514377, + "grad_norm": 0.1308225691318512, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4237 + }, + { + "epoch": 3.3849840255591053, + "grad_norm": 0.1353990137577057, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4238 + }, + { + "epoch": 3.385782747603834, + "grad_norm": 0.08648121356964111, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4239 + }, + { + "epoch": 3.3865814696485623, + "grad_norm": 0.23568236827850342, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4240 + }, + { + "epoch": 3.387380191693291, + "grad_norm": 0.20514735579490662, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4241 + }, + { + "epoch": 3.3881789137380194, + "grad_norm": 0.10276424884796143, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4242 + }, + { + "epoch": 3.3889776357827475, + "grad_norm": 0.1838751584291458, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4243 + }, + { + "epoch": 3.389776357827476, + "grad_norm": 0.1697031557559967, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4244 + }, + { + "epoch": 3.3905750798722045, + "grad_norm": 0.11439084261655807, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4245 + }, + { + "epoch": 3.391373801916933, + "grad_norm": 0.14021249115467072, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4246 + }, + { + "epoch": 3.392172523961661, + "grad_norm": 0.13989558815956116, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4247 + }, + { + "epoch": 3.3929712460063897, + "grad_norm": 0.12039095908403397, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4248 + }, + { + "epoch": 3.393769968051118, + "grad_norm": 0.17901045083999634, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4249 + }, + { + "epoch": 3.3945686900958467, + "grad_norm": 0.1053776666522026, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 4250 + }, + { + "epoch": 3.3953674121405752, + "grad_norm": 1.7777512073516846, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4251 + }, + { + "epoch": 3.3961661341853033, + "grad_norm": 0.06677904725074768, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 4252 + }, + { + "epoch": 3.396964856230032, + "grad_norm": 0.16123540699481964, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4253 + }, + { + "epoch": 3.3977635782747604, + "grad_norm": 0.21530884504318237, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4254 + }, + { + "epoch": 3.398562300319489, + "grad_norm": 0.20979386568069458, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4255 + }, + { + "epoch": 3.3993610223642174, + "grad_norm": 0.14755229651927948, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 4256 + }, + { + "epoch": 3.4001597444089455, + "grad_norm": 0.10182930529117584, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4257 + }, + { + "epoch": 3.400958466453674, + "grad_norm": 0.11478064954280853, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4258 + }, + { + "epoch": 3.4017571884984026, + "grad_norm": 0.2052452266216278, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4259 + }, + { + "epoch": 3.402555910543131, + "grad_norm": 0.6292023062705994, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4260 + }, + { + "epoch": 3.4033546325878596, + "grad_norm": 0.0666726678609848, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4261 + }, + { + "epoch": 3.4041533546325877, + "grad_norm": 0.11848346143960953, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4262 + }, + { + "epoch": 3.4049520766773163, + "grad_norm": 0.15276756882667542, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4263 + }, + { + "epoch": 3.405750798722045, + "grad_norm": 0.08534786105155945, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4264 + }, + { + "epoch": 3.4065495207667733, + "grad_norm": 0.07453266531229019, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4265 + }, + { + "epoch": 3.4073482428115014, + "grad_norm": 0.12894752621650696, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4266 + }, + { + "epoch": 3.40814696485623, + "grad_norm": 0.11341612786054611, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4267 + }, + { + "epoch": 3.4089456869009584, + "grad_norm": 0.06551265716552734, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4268 + }, + { + "epoch": 3.409744408945687, + "grad_norm": 0.08828622102737427, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4269 + }, + { + "epoch": 3.4105431309904155, + "grad_norm": 0.06951884925365448, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4270 + }, + { + "epoch": 3.4113418530351436, + "grad_norm": 0.0785432904958725, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4271 + }, + { + "epoch": 3.412140575079872, + "grad_norm": 0.06681766360998154, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4272 + }, + { + "epoch": 3.4129392971246006, + "grad_norm": 0.060111526399850845, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4273 + }, + { + "epoch": 3.413738019169329, + "grad_norm": 0.07451382279396057, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 4274 + }, + { + "epoch": 3.4145367412140573, + "grad_norm": 0.08646225184202194, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4275 + }, + { + "epoch": 3.415335463258786, + "grad_norm": 0.07061789929866791, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4276 + }, + { + "epoch": 3.4161341853035143, + "grad_norm": 0.09554821997880936, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4277 + }, + { + "epoch": 3.416932907348243, + "grad_norm": 0.11288002133369446, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4278 + }, + { + "epoch": 3.4177316293929714, + "grad_norm": 0.10565607994794846, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4279 + }, + { + "epoch": 3.4185303514377, + "grad_norm": 0.08235503733158112, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4280 + }, + { + "epoch": 3.419329073482428, + "grad_norm": 0.1302265226840973, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4281 + }, + { + "epoch": 3.4201277955271565, + "grad_norm": 0.07910848408937454, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4282 + }, + { + "epoch": 3.420926517571885, + "grad_norm": 0.10624215006828308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4283 + }, + { + "epoch": 3.4217252396166136, + "grad_norm": 0.08545158058404922, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4284 + }, + { + "epoch": 3.4225239616613417, + "grad_norm": 0.07010428607463837, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4285 + }, + { + "epoch": 3.42332268370607, + "grad_norm": 0.08256867527961731, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4286 + }, + { + "epoch": 3.4241214057507987, + "grad_norm": 0.13074247539043427, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4287 + }, + { + "epoch": 3.4249201277955272, + "grad_norm": 0.18332679569721222, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4288 + }, + { + "epoch": 3.4257188498402558, + "grad_norm": 0.1671689748764038, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4289 + }, + { + "epoch": 3.426517571884984, + "grad_norm": 0.10386296361684799, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4290 + }, + { + "epoch": 3.4273162939297124, + "grad_norm": 0.07554108649492264, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 4291 + }, + { + "epoch": 3.428115015974441, + "grad_norm": 0.1138196587562561, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4292 + }, + { + "epoch": 3.4289137380191694, + "grad_norm": 0.1681462526321411, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4293 + }, + { + "epoch": 3.4297124600638975, + "grad_norm": 0.1833198368549347, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4294 + }, + { + "epoch": 3.430511182108626, + "grad_norm": 0.10269228368997574, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4295 + }, + { + "epoch": 3.4313099041533546, + "grad_norm": 0.08876223117113113, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4296 + }, + { + "epoch": 3.432108626198083, + "grad_norm": 0.21489253640174866, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4297 + }, + { + "epoch": 3.4329073482428116, + "grad_norm": 0.22669701278209686, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4298 + }, + { + "epoch": 3.43370607028754, + "grad_norm": 0.16946858167648315, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4299 + }, + { + "epoch": 3.4345047923322682, + "grad_norm": 0.05162649229168892, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4300 + }, + { + "epoch": 3.4353035143769968, + "grad_norm": 0.09700657427310944, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4301 + }, + { + "epoch": 3.4361022364217253, + "grad_norm": 0.14858263731002808, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4302 + }, + { + "epoch": 3.436900958466454, + "grad_norm": 0.16938818991184235, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4303 + }, + { + "epoch": 3.437699680511182, + "grad_norm": 0.13441702723503113, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4304 + }, + { + "epoch": 3.4384984025559104, + "grad_norm": 0.07661818712949753, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4305 + }, + { + "epoch": 3.439297124600639, + "grad_norm": 0.19436489045619965, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4306 + }, + { + "epoch": 3.4400958466453675, + "grad_norm": 0.20447906851768494, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4307 + }, + { + "epoch": 3.440894568690096, + "grad_norm": 0.1414622664451599, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4308 + }, + { + "epoch": 3.441693290734824, + "grad_norm": 0.06289447098970413, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4309 + }, + { + "epoch": 3.4424920127795526, + "grad_norm": 0.0966482162475586, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4310 + }, + { + "epoch": 3.443290734824281, + "grad_norm": 0.1300116777420044, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4311 + }, + { + "epoch": 3.4440894568690097, + "grad_norm": 0.11638098210096359, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4312 + }, + { + "epoch": 3.4448881789137378, + "grad_norm": 0.08284632116556168, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4313 + }, + { + "epoch": 3.4456869009584663, + "grad_norm": 0.0617060512304306, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4314 + }, + { + "epoch": 3.446485623003195, + "grad_norm": 0.12798283994197845, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4315 + }, + { + "epoch": 3.4472843450479234, + "grad_norm": 0.12712593376636505, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4316 + }, + { + "epoch": 3.448083067092652, + "grad_norm": 0.09164460003376007, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4317 + }, + { + "epoch": 3.4488817891373804, + "grad_norm": 0.07618964463472366, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4318 + }, + { + "epoch": 3.4496805111821085, + "grad_norm": 0.07986288517713547, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4319 + }, + { + "epoch": 3.450479233226837, + "grad_norm": 0.0783228650689125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4320 + }, + { + "epoch": 3.4512779552715656, + "grad_norm": 0.09899114072322845, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4321 + }, + { + "epoch": 3.452076677316294, + "grad_norm": 0.13710227608680725, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4322 + }, + { + "epoch": 3.452875399361022, + "grad_norm": 0.1281789392232895, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4323 + }, + { + "epoch": 3.4536741214057507, + "grad_norm": 0.11021110415458679, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4324 + }, + { + "epoch": 3.4544728434504792, + "grad_norm": 0.11450989544391632, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4325 + }, + { + "epoch": 3.4552715654952078, + "grad_norm": 0.09010434150695801, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4326 + }, + { + "epoch": 3.4560702875399363, + "grad_norm": 0.08817321807146072, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4327 + }, + { + "epoch": 3.4568690095846644, + "grad_norm": 0.06502921879291534, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4328 + }, + { + "epoch": 3.457667731629393, + "grad_norm": 0.13399769365787506, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4329 + }, + { + "epoch": 3.4584664536741214, + "grad_norm": 0.19785602390766144, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4330 + }, + { + "epoch": 3.45926517571885, + "grad_norm": 0.15761834383010864, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4331 + }, + { + "epoch": 3.460063897763578, + "grad_norm": 0.11824636161327362, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4332 + }, + { + "epoch": 3.4608626198083066, + "grad_norm": 0.07031631469726562, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4333 + }, + { + "epoch": 3.461661341853035, + "grad_norm": 0.09940601140260696, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4334 + }, + { + "epoch": 3.4624600638977636, + "grad_norm": 0.11931589990854263, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4335 + }, + { + "epoch": 3.463258785942492, + "grad_norm": 0.0967932790517807, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4336 + }, + { + "epoch": 3.4640575079872207, + "grad_norm": 0.09523937106132507, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4337 + }, + { + "epoch": 3.4648562300319488, + "grad_norm": 0.09964902698993683, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4338 + }, + { + "epoch": 3.4656549520766773, + "grad_norm": 0.09898022562265396, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4339 + }, + { + "epoch": 3.466453674121406, + "grad_norm": 0.05388521030545235, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4340 + }, + { + "epoch": 3.4672523961661343, + "grad_norm": 0.06455415487289429, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4341 + }, + { + "epoch": 3.4680511182108624, + "grad_norm": 0.05497310310602188, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4342 + }, + { + "epoch": 3.468849840255591, + "grad_norm": 0.049679841846227646, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4343 + }, + { + "epoch": 3.4696485623003195, + "grad_norm": 0.05664939060807228, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4344 + }, + { + "epoch": 3.470447284345048, + "grad_norm": 0.06651245057582855, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4345 + }, + { + "epoch": 3.4712460063897765, + "grad_norm": 0.08480475097894669, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4346 + }, + { + "epoch": 3.4720447284345046, + "grad_norm": 0.07331875711679459, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4347 + }, + { + "epoch": 3.472843450479233, + "grad_norm": 0.0505477711558342, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4348 + }, + { + "epoch": 3.4736421725239617, + "grad_norm": 0.06969176232814789, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4349 + }, + { + "epoch": 3.47444089456869, + "grad_norm": 0.08915391564369202, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4350 + }, + { + "epoch": 3.4752396166134183, + "grad_norm": 0.09378752112388611, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4351 + }, + { + "epoch": 3.476038338658147, + "grad_norm": 0.059195373207330704, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4352 + }, + { + "epoch": 3.4768370607028753, + "grad_norm": 0.07094884663820267, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4353 + }, + { + "epoch": 3.477635782747604, + "grad_norm": 0.11091995984315872, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4354 + }, + { + "epoch": 3.4784345047923324, + "grad_norm": 0.14018885791301727, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4355 + }, + { + "epoch": 3.479233226837061, + "grad_norm": 0.13553708791732788, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4356 + }, + { + "epoch": 3.480031948881789, + "grad_norm": 0.08005240559577942, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4357 + }, + { + "epoch": 3.4808306709265175, + "grad_norm": 0.05309261009097099, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4358 + }, + { + "epoch": 3.481629392971246, + "grad_norm": 0.09956394135951996, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4359 + }, + { + "epoch": 3.4824281150159746, + "grad_norm": 0.13189470767974854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4360 + }, + { + "epoch": 3.4832268370607027, + "grad_norm": 0.13651393353939056, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4361 + }, + { + "epoch": 3.484025559105431, + "grad_norm": 0.12467528879642487, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4362 + }, + { + "epoch": 3.4848242811501597, + "grad_norm": 0.11428561061620712, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4363 + }, + { + "epoch": 3.4856230031948883, + "grad_norm": 0.12095288187265396, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4364 + }, + { + "epoch": 3.486421725239617, + "grad_norm": 0.05889631807804108, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4365 + }, + { + "epoch": 3.487220447284345, + "grad_norm": 0.1158040463924408, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4366 + }, + { + "epoch": 3.4880191693290734, + "grad_norm": 0.11070148646831512, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4367 + }, + { + "epoch": 3.488817891373802, + "grad_norm": 0.0625298023223877, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4368 + }, + { + "epoch": 3.4896166134185305, + "grad_norm": 0.11865562945604324, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4369 + }, + { + "epoch": 3.4904153354632586, + "grad_norm": 0.12237154692411423, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4370 + }, + { + "epoch": 3.491214057507987, + "grad_norm": 0.05703050270676613, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4371 + }, + { + "epoch": 3.4920127795527156, + "grad_norm": 0.17314022779464722, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4372 + }, + { + "epoch": 3.492811501597444, + "grad_norm": 0.2984711825847626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4373 + }, + { + "epoch": 3.4936102236421727, + "grad_norm": 0.30129608511924744, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4374 + }, + { + "epoch": 3.494408945686901, + "grad_norm": 0.12154170870780945, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4375 + }, + { + "epoch": 3.4952076677316293, + "grad_norm": 0.12467148154973984, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4376 + }, + { + "epoch": 3.496006389776358, + "grad_norm": 0.23285721242427826, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4377 + }, + { + "epoch": 3.4968051118210863, + "grad_norm": 0.20723310112953186, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4378 + }, + { + "epoch": 3.497603833865815, + "grad_norm": 0.13221028447151184, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4379 + }, + { + "epoch": 3.498402555910543, + "grad_norm": 0.06008061394095421, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4380 + }, + { + "epoch": 3.4992012779552715, + "grad_norm": 0.12877988815307617, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4381 + }, + { + "epoch": 3.5, + "grad_norm": 0.1951032429933548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4382 + }, + { + "epoch": 3.5007987220447285, + "grad_norm": 0.13804258406162262, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4383 + }, + { + "epoch": 3.501597444089457, + "grad_norm": 0.06761720031499863, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4384 + }, + { + "epoch": 3.502396166134185, + "grad_norm": 0.13217084109783173, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4385 + }, + { + "epoch": 3.5031948881789137, + "grad_norm": 0.11773377656936646, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4386 + }, + { + "epoch": 3.503993610223642, + "grad_norm": 0.07580399513244629, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4387 + }, + { + "epoch": 3.5047923322683707, + "grad_norm": 0.1739586442708969, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4388 + }, + { + "epoch": 3.505591054313099, + "grad_norm": 0.14863203465938568, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4389 + }, + { + "epoch": 3.5063897763578273, + "grad_norm": 0.07858511805534363, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4390 + }, + { + "epoch": 3.507188498402556, + "grad_norm": 0.15966418385505676, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4391 + }, + { + "epoch": 3.5079872204472844, + "grad_norm": 0.28761810064315796, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4392 + }, + { + "epoch": 3.508785942492013, + "grad_norm": 0.24169668555259705, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4393 + }, + { + "epoch": 3.5095846645367414, + "grad_norm": 0.07907059788703918, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4394 + }, + { + "epoch": 3.5103833865814695, + "grad_norm": 0.20243291556835175, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4395 + }, + { + "epoch": 3.511182108626198, + "grad_norm": 0.302198588848114, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4396 + }, + { + "epoch": 3.5119808306709266, + "grad_norm": 0.2544843554496765, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4397 + }, + { + "epoch": 3.512779552715655, + "grad_norm": 0.07381684333086014, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4398 + }, + { + "epoch": 3.513578274760383, + "grad_norm": 0.17388348281383514, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4399 + }, + { + "epoch": 3.5143769968051117, + "grad_norm": 0.2293306440114975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4400 + }, + { + "epoch": 3.5151757188498403, + "grad_norm": 0.07548263669013977, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4401 + }, + { + "epoch": 3.515974440894569, + "grad_norm": 0.1924273669719696, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4402 + }, + { + "epoch": 3.5167731629392973, + "grad_norm": 0.26867300271987915, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4403 + }, + { + "epoch": 3.5175718849840254, + "grad_norm": 0.14461541175842285, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4404 + }, + { + "epoch": 3.518370607028754, + "grad_norm": 0.12608370184898376, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4405 + }, + { + "epoch": 3.5191693290734825, + "grad_norm": 0.20579756796360016, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4406 + }, + { + "epoch": 3.519968051118211, + "grad_norm": 0.12286399304866791, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4407 + }, + { + "epoch": 3.520766773162939, + "grad_norm": 0.055247388780117035, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4408 + }, + { + "epoch": 3.5215654952076676, + "grad_norm": 0.07877562195062637, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4409 + }, + { + "epoch": 3.522364217252396, + "grad_norm": 0.0769568607211113, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4410 + }, + { + "epoch": 3.5231629392971247, + "grad_norm": 0.0898609384894371, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4411 + }, + { + "epoch": 3.523961661341853, + "grad_norm": 0.057637594640254974, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4412 + }, + { + "epoch": 3.5247603833865817, + "grad_norm": 0.12046241015195847, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4413 + }, + { + "epoch": 3.52555910543131, + "grad_norm": 0.09949496388435364, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4414 + }, + { + "epoch": 3.5263578274760383, + "grad_norm": 0.054411277174949646, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4415 + }, + { + "epoch": 3.527156549520767, + "grad_norm": 0.08293551951646805, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4416 + }, + { + "epoch": 3.527955271565495, + "grad_norm": 0.07669435441493988, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4417 + }, + { + "epoch": 3.5287539936102235, + "grad_norm": 0.06382326781749725, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4418 + }, + { + "epoch": 3.529552715654952, + "grad_norm": 0.07673322409391403, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4419 + }, + { + "epoch": 3.5303514376996805, + "grad_norm": 0.08052650839090347, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4420 + }, + { + "epoch": 3.531150159744409, + "grad_norm": 0.1354246884584427, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4421 + }, + { + "epoch": 3.5319488817891376, + "grad_norm": 0.07951574772596359, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 4422 + }, + { + "epoch": 3.5327476038338657, + "grad_norm": 0.11002526432275772, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4423 + }, + { + "epoch": 3.533546325878594, + "grad_norm": 0.18597234785556793, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4424 + }, + { + "epoch": 3.5343450479233227, + "grad_norm": 0.12601099908351898, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4425 + }, + { + "epoch": 3.5351437699680512, + "grad_norm": 0.11181886494159698, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4426 + }, + { + "epoch": 3.5359424920127793, + "grad_norm": 0.11489108949899673, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4427 + }, + { + "epoch": 3.536741214057508, + "grad_norm": 0.10422708839178085, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4428 + }, + { + "epoch": 3.5375399361022364, + "grad_norm": 0.13701972365379333, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4429 + }, + { + "epoch": 3.538338658146965, + "grad_norm": 0.10713281482458115, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4430 + }, + { + "epoch": 3.5391373801916934, + "grad_norm": 0.11508526653051376, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4431 + }, + { + "epoch": 3.539936102236422, + "grad_norm": 0.061856236308813095, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4432 + }, + { + "epoch": 3.54073482428115, + "grad_norm": 0.12080623209476471, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4433 + }, + { + "epoch": 3.5415335463258786, + "grad_norm": 0.12233573198318481, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4434 + }, + { + "epoch": 3.542332268370607, + "grad_norm": 0.07041362673044205, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4435 + }, + { + "epoch": 3.543130990415335, + "grad_norm": 0.1162526085972786, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4436 + }, + { + "epoch": 3.5439297124600637, + "grad_norm": 0.12962234020233154, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4437 + }, + { + "epoch": 3.5447284345047922, + "grad_norm": 0.1368536353111267, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4438 + }, + { + "epoch": 3.5455271565495208, + "grad_norm": 0.061806995421648026, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4439 + }, + { + "epoch": 3.5463258785942493, + "grad_norm": 0.11016163975000381, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4440 + }, + { + "epoch": 3.547124600638978, + "grad_norm": 0.0992715135216713, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4441 + }, + { + "epoch": 3.547923322683706, + "grad_norm": 0.14015190303325653, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4442 + }, + { + "epoch": 3.5487220447284344, + "grad_norm": 0.07255455106496811, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4443 + }, + { + "epoch": 3.549520766773163, + "grad_norm": 0.13293872773647308, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4444 + }, + { + "epoch": 3.5503194888178915, + "grad_norm": 0.08923539519309998, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4445 + }, + { + "epoch": 3.5511182108626196, + "grad_norm": 0.10125918686389923, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4446 + }, + { + "epoch": 3.551916932907348, + "grad_norm": 0.12369748950004578, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4447 + }, + { + "epoch": 3.5527156549520766, + "grad_norm": 0.14656996726989746, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4448 + }, + { + "epoch": 3.553514376996805, + "grad_norm": 0.14212539792060852, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4449 + }, + { + "epoch": 3.5543130990415337, + "grad_norm": 0.08640166372060776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4450 + }, + { + "epoch": 3.5551118210862622, + "grad_norm": 0.05552735924720764, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4451 + }, + { + "epoch": 3.5559105431309903, + "grad_norm": 0.12888140976428986, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4452 + }, + { + "epoch": 3.556709265175719, + "grad_norm": 0.10696940869092941, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4453 + }, + { + "epoch": 3.5575079872204474, + "grad_norm": 0.06578963249921799, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4454 + }, + { + "epoch": 3.5583067092651754, + "grad_norm": 0.16173291206359863, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4455 + }, + { + "epoch": 3.559105431309904, + "grad_norm": 0.1550486832857132, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4456 + }, + { + "epoch": 3.5599041533546325, + "grad_norm": 0.14084209501743317, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4457 + }, + { + "epoch": 3.560702875399361, + "grad_norm": 0.12024512141942978, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4458 + }, + { + "epoch": 3.5615015974440896, + "grad_norm": 0.12514936923980713, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4459 + }, + { + "epoch": 3.562300319488818, + "grad_norm": 0.16444219648838043, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4460 + }, + { + "epoch": 3.563099041533546, + "grad_norm": 0.11520830541849136, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4461 + }, + { + "epoch": 3.5638977635782747, + "grad_norm": 0.07884586602449417, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4462 + }, + { + "epoch": 3.5646964856230032, + "grad_norm": 0.1655684858560562, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4463 + }, + { + "epoch": 3.5654952076677318, + "grad_norm": 0.15222500264644623, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4464 + }, + { + "epoch": 3.56629392971246, + "grad_norm": 0.06106618419289589, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4465 + }, + { + "epoch": 3.5670926517571884, + "grad_norm": 0.10545333474874496, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4466 + }, + { + "epoch": 3.567891373801917, + "grad_norm": 0.1353088915348053, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4467 + }, + { + "epoch": 3.5686900958466454, + "grad_norm": 0.11200091242790222, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4468 + }, + { + "epoch": 3.569488817891374, + "grad_norm": 0.052965741604566574, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4469 + }, + { + "epoch": 3.5702875399361025, + "grad_norm": 0.1244843453168869, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4470 + }, + { + "epoch": 3.5710862619808306, + "grad_norm": 0.1160016730427742, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4471 + }, + { + "epoch": 3.571884984025559, + "grad_norm": 0.04874402657151222, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4472 + }, + { + "epoch": 3.5726837060702876, + "grad_norm": 0.14222301542758942, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4473 + }, + { + "epoch": 3.5734824281150157, + "grad_norm": 0.1190859004855156, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4474 + }, + { + "epoch": 3.5742811501597442, + "grad_norm": 0.0659632682800293, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4475 + }, + { + "epoch": 3.5750798722044728, + "grad_norm": 0.07350483536720276, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4476 + }, + { + "epoch": 3.5758785942492013, + "grad_norm": 0.1220504492521286, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4477 + }, + { + "epoch": 3.57667731629393, + "grad_norm": 0.08952966332435608, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4478 + }, + { + "epoch": 3.5774760383386583, + "grad_norm": 0.08828000724315643, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4479 + }, + { + "epoch": 3.5782747603833864, + "grad_norm": 0.14621564745903015, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4480 + }, + { + "epoch": 3.579073482428115, + "grad_norm": 0.13653770089149475, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4481 + }, + { + "epoch": 3.5798722044728435, + "grad_norm": 0.0682564228773117, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4482 + }, + { + "epoch": 3.580670926517572, + "grad_norm": 0.06511309742927551, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4483 + }, + { + "epoch": 3.5814696485623, + "grad_norm": 0.08800239861011505, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4484 + }, + { + "epoch": 3.5822683706070286, + "grad_norm": 0.06488335877656937, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4485 + }, + { + "epoch": 3.583067092651757, + "grad_norm": 0.06505738198757172, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4486 + }, + { + "epoch": 3.5838658146964857, + "grad_norm": 0.07395542412996292, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4487 + }, + { + "epoch": 3.584664536741214, + "grad_norm": 0.06717971712350845, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4488 + }, + { + "epoch": 3.5854632587859427, + "grad_norm": 0.056708067655563354, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4489 + }, + { + "epoch": 3.586261980830671, + "grad_norm": 0.06316737830638885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4490 + }, + { + "epoch": 3.5870607028753994, + "grad_norm": 0.06079665198922157, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4491 + }, + { + "epoch": 3.587859424920128, + "grad_norm": 0.1293981820344925, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4492 + }, + { + "epoch": 3.588658146964856, + "grad_norm": 0.08021418750286102, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4493 + }, + { + "epoch": 3.5894568690095845, + "grad_norm": 0.096865214407444, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4494 + }, + { + "epoch": 3.590255591054313, + "grad_norm": 0.06794966757297516, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4495 + }, + { + "epoch": 3.5910543130990416, + "grad_norm": 0.04527222737669945, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4496 + }, + { + "epoch": 3.59185303514377, + "grad_norm": 0.07153941690921783, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4497 + }, + { + "epoch": 3.5926517571884986, + "grad_norm": 0.07480445504188538, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4498 + }, + { + "epoch": 3.5934504792332267, + "grad_norm": 0.09161835163831711, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4499 + }, + { + "epoch": 3.594249201277955, + "grad_norm": 0.08420681953430176, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4500 + }, + { + "epoch": 3.5950479233226837, + "grad_norm": 0.04745415225625038, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4501 + }, + { + "epoch": 3.5958466453674123, + "grad_norm": 0.061325494199991226, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4502 + }, + { + "epoch": 3.5966453674121404, + "grad_norm": 0.08550430834293365, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4503 + }, + { + "epoch": 3.597444089456869, + "grad_norm": 0.09530419111251831, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4504 + }, + { + "epoch": 3.5982428115015974, + "grad_norm": 0.10484769195318222, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4505 + }, + { + "epoch": 3.599041533546326, + "grad_norm": 0.08398665487766266, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4506 + }, + { + "epoch": 3.5998402555910545, + "grad_norm": 0.1644149124622345, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4507 + }, + { + "epoch": 3.600638977635783, + "grad_norm": 0.0803244560956955, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4508 + }, + { + "epoch": 3.601437699680511, + "grad_norm": 0.12512895464897156, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4509 + }, + { + "epoch": 3.6022364217252396, + "grad_norm": 0.1404576301574707, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4510 + }, + { + "epoch": 3.603035143769968, + "grad_norm": 0.10823316127061844, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4511 + }, + { + "epoch": 3.6038338658146962, + "grad_norm": 0.06985688954591751, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4512 + }, + { + "epoch": 3.6046325878594248, + "grad_norm": 0.1651264876127243, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4513 + }, + { + "epoch": 3.6054313099041533, + "grad_norm": 0.19752484560012817, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4514 + }, + { + "epoch": 3.606230031948882, + "grad_norm": 0.20005464553833008, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4515 + }, + { + "epoch": 3.6070287539936103, + "grad_norm": 0.1478145569562912, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4516 + }, + { + "epoch": 3.607827476038339, + "grad_norm": 0.05737901106476784, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4517 + }, + { + "epoch": 3.608626198083067, + "grad_norm": 0.16174650192260742, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4518 + }, + { + "epoch": 3.6094249201277955, + "grad_norm": 0.1959141194820404, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4519 + }, + { + "epoch": 3.610223642172524, + "grad_norm": 0.09767267853021622, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4520 + }, + { + "epoch": 3.6110223642172525, + "grad_norm": 0.10553760081529617, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4521 + }, + { + "epoch": 3.6118210862619806, + "grad_norm": 0.19380977749824524, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4522 + }, + { + "epoch": 3.612619808306709, + "grad_norm": 0.2024526745080948, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4523 + }, + { + "epoch": 3.6134185303514377, + "grad_norm": 0.09705837070941925, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4524 + }, + { + "epoch": 3.614217252396166, + "grad_norm": 0.12530986964702606, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4525 + }, + { + "epoch": 3.6150159744408947, + "grad_norm": 0.20901283621788025, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4526 + }, + { + "epoch": 3.6158146964856233, + "grad_norm": 0.16532309353351593, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4527 + }, + { + "epoch": 3.6166134185303513, + "grad_norm": 0.18353991210460663, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4528 + }, + { + "epoch": 3.61741214057508, + "grad_norm": 0.12912365794181824, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4529 + }, + { + "epoch": 3.6182108626198084, + "grad_norm": 0.2052653580904007, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4530 + }, + { + "epoch": 3.6190095846645365, + "grad_norm": 0.1395503133535385, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4531 + }, + { + "epoch": 3.619808306709265, + "grad_norm": 0.07939961552619934, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4532 + }, + { + "epoch": 3.6206070287539935, + "grad_norm": 0.10098318755626678, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4533 + }, + { + "epoch": 3.621405750798722, + "grad_norm": 0.14332561194896698, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4534 + }, + { + "epoch": 3.6222044728434506, + "grad_norm": 0.09697199612855911, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4535 + }, + { + "epoch": 3.623003194888179, + "grad_norm": 0.07785658538341522, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4536 + }, + { + "epoch": 3.623801916932907, + "grad_norm": 0.11263108998537064, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4537 + }, + { + "epoch": 3.6246006389776357, + "grad_norm": 0.18257030844688416, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4538 + }, + { + "epoch": 3.6253993610223643, + "grad_norm": 0.1456373631954193, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4539 + }, + { + "epoch": 3.626198083067093, + "grad_norm": 0.06831679493188858, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4540 + }, + { + "epoch": 3.626996805111821, + "grad_norm": 0.12324535846710205, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4541 + }, + { + "epoch": 3.6277955271565494, + "grad_norm": 0.15868282318115234, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4542 + }, + { + "epoch": 3.628594249201278, + "grad_norm": 0.09355167299509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4543 + }, + { + "epoch": 3.6293929712460065, + "grad_norm": 0.08047328144311905, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4544 + }, + { + "epoch": 3.630191693290735, + "grad_norm": 0.12683328986167908, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4545 + }, + { + "epoch": 3.6309904153354635, + "grad_norm": 0.11964920908212662, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4546 + }, + { + "epoch": 3.6317891373801916, + "grad_norm": 0.0504109226167202, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4547 + }, + { + "epoch": 3.63258785942492, + "grad_norm": 0.11909852921962738, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4548 + }, + { + "epoch": 3.6333865814696487, + "grad_norm": 0.16763992607593536, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4549 + }, + { + "epoch": 3.6341853035143767, + "grad_norm": 0.1486649513244629, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4550 + }, + { + "epoch": 3.6349840255591053, + "grad_norm": 0.06941305845975876, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4551 + }, + { + "epoch": 3.635782747603834, + "grad_norm": 0.1177566722035408, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4552 + }, + { + "epoch": 3.6365814696485623, + "grad_norm": 0.23368601500988007, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4553 + }, + { + "epoch": 3.637380191693291, + "grad_norm": 0.24657249450683594, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4554 + }, + { + "epoch": 3.6381789137380194, + "grad_norm": 0.10063605010509491, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4555 + }, + { + "epoch": 3.6389776357827475, + "grad_norm": 0.1553603708744049, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4556 + }, + { + "epoch": 3.639776357827476, + "grad_norm": 0.25588107109069824, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4557 + }, + { + "epoch": 3.6405750798722045, + "grad_norm": 0.15270236134529114, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4558 + }, + { + "epoch": 3.641373801916933, + "grad_norm": 0.108666330575943, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4559 + }, + { + "epoch": 3.642172523961661, + "grad_norm": 0.19828133285045624, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4560 + }, + { + "epoch": 3.6429712460063897, + "grad_norm": 0.21500051021575928, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4561 + }, + { + "epoch": 3.643769968051118, + "grad_norm": 0.16299934685230255, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4562 + }, + { + "epoch": 3.6445686900958467, + "grad_norm": 0.07390763610601425, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4563 + }, + { + "epoch": 3.6453674121405752, + "grad_norm": 0.22709119319915771, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4564 + }, + { + "epoch": 3.6461661341853038, + "grad_norm": 0.15557943284511566, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4565 + }, + { + "epoch": 3.646964856230032, + "grad_norm": 0.062457580119371414, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4566 + }, + { + "epoch": 3.6477635782747604, + "grad_norm": 0.09101095795631409, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4567 + }, + { + "epoch": 3.648562300319489, + "grad_norm": 0.08700825273990631, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 4568 + }, + { + "epoch": 3.649361022364217, + "grad_norm": 0.058703795075416565, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4569 + }, + { + "epoch": 3.6501597444089455, + "grad_norm": 0.056776538491249084, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4570 + }, + { + "epoch": 3.650958466453674, + "grad_norm": 0.062245409935712814, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4571 + }, + { + "epoch": 3.6517571884984026, + "grad_norm": 0.0534074492752552, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 4572 + }, + { + "epoch": 3.652555910543131, + "grad_norm": 0.09061384946107864, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4573 + }, + { + "epoch": 3.6533546325878596, + "grad_norm": 0.07323598116636276, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4574 + }, + { + "epoch": 3.6541533546325877, + "grad_norm": 0.1120329350233078, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 4575 + }, + { + "epoch": 3.6549520766773163, + "grad_norm": 0.07965485006570816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4576 + }, + { + "epoch": 3.655750798722045, + "grad_norm": 0.06320462375879288, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4577 + }, + { + "epoch": 3.6565495207667733, + "grad_norm": 0.07869421690702438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4578 + }, + { + "epoch": 3.6573482428115014, + "grad_norm": 0.09003151208162308, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4579 + }, + { + "epoch": 3.65814696485623, + "grad_norm": 0.05570388212800026, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4580 + }, + { + "epoch": 3.6589456869009584, + "grad_norm": 0.15563733875751495, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4581 + }, + { + "epoch": 3.659744408945687, + "grad_norm": 0.1422414481639862, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4582 + }, + { + "epoch": 3.6605431309904155, + "grad_norm": 0.13704177737236023, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4583 + }, + { + "epoch": 3.661341853035144, + "grad_norm": 0.36126458644866943, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4584 + }, + { + "epoch": 3.662140575079872, + "grad_norm": 0.09024632722139359, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4585 + }, + { + "epoch": 3.6629392971246006, + "grad_norm": 0.07135412096977234, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4586 + }, + { + "epoch": 3.663738019169329, + "grad_norm": 0.06172417849302292, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4587 + }, + { + "epoch": 3.6645367412140573, + "grad_norm": 0.05962595343589783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4588 + }, + { + "epoch": 3.665335463258786, + "grad_norm": 0.07063078880310059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4589 + }, + { + "epoch": 3.6661341853035143, + "grad_norm": 0.1445596069097519, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4590 + }, + { + "epoch": 3.666932907348243, + "grad_norm": 0.09224060922861099, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4591 + }, + { + "epoch": 3.6677316293929714, + "grad_norm": 0.10353037714958191, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4592 + }, + { + "epoch": 3.6685303514377, + "grad_norm": 0.10922796279191971, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4593 + }, + { + "epoch": 3.669329073482428, + "grad_norm": 0.08728764951229095, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4594 + }, + { + "epoch": 3.6701277955271565, + "grad_norm": 0.0639081671833992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4595 + }, + { + "epoch": 3.670926517571885, + "grad_norm": 0.050491299480199814, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4596 + }, + { + "epoch": 3.6717252396166136, + "grad_norm": 0.07127548009157181, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4597 + }, + { + "epoch": 3.6725239616613417, + "grad_norm": 0.05432606860995293, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4598 + }, + { + "epoch": 3.67332268370607, + "grad_norm": 0.0653342455625534, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4599 + }, + { + "epoch": 3.6741214057507987, + "grad_norm": 0.08766797184944153, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4600 + }, + { + "epoch": 3.6749201277955272, + "grad_norm": 0.0816602036356926, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4601 + }, + { + "epoch": 3.6757188498402558, + "grad_norm": 0.08774783462285995, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4602 + }, + { + "epoch": 3.6765175718849843, + "grad_norm": 0.07776570320129395, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4603 + }, + { + "epoch": 3.6773162939297124, + "grad_norm": 0.07067213952541351, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4604 + }, + { + "epoch": 3.678115015974441, + "grad_norm": 0.06581863760948181, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4605 + }, + { + "epoch": 3.6789137380191694, + "grad_norm": 0.08631278574466705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4606 + }, + { + "epoch": 3.6797124600638975, + "grad_norm": 0.10875384509563446, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4607 + }, + { + "epoch": 3.680511182108626, + "grad_norm": 0.11207764595746994, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4608 + }, + { + "epoch": 3.6813099041533546, + "grad_norm": 0.08943730592727661, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4609 + }, + { + "epoch": 3.682108626198083, + "grad_norm": 0.1922001987695694, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4610 + }, + { + "epoch": 3.6829073482428116, + "grad_norm": 0.10121189057826996, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4611 + }, + { + "epoch": 3.68370607028754, + "grad_norm": 0.05991055443882942, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4612 + }, + { + "epoch": 3.6845047923322682, + "grad_norm": 0.0897853821516037, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4613 + }, + { + "epoch": 3.6853035143769968, + "grad_norm": 0.13160353899002075, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4614 + }, + { + "epoch": 3.6861022364217253, + "grad_norm": 0.13855913281440735, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4615 + }, + { + "epoch": 3.686900958466454, + "grad_norm": 0.11086787283420563, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4616 + }, + { + "epoch": 3.687699680511182, + "grad_norm": 0.07992085069417953, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4617 + }, + { + "epoch": 3.6884984025559104, + "grad_norm": 0.11618958413600922, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4618 + }, + { + "epoch": 3.689297124600639, + "grad_norm": 0.19551296532154083, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4619 + }, + { + "epoch": 3.6900958466453675, + "grad_norm": 0.20239807665348053, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4620 + }, + { + "epoch": 3.690894568690096, + "grad_norm": 0.13233833014965057, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4621 + }, + { + "epoch": 3.6916932907348246, + "grad_norm": 0.08789848536252975, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4622 + }, + { + "epoch": 3.6924920127795526, + "grad_norm": 0.2363075315952301, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4623 + }, + { + "epoch": 3.693290734824281, + "grad_norm": 0.2585245668888092, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4624 + }, + { + "epoch": 3.6940894568690097, + "grad_norm": 0.15822109580039978, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4625 + }, + { + "epoch": 3.6948881789137378, + "grad_norm": 0.07197296619415283, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4626 + }, + { + "epoch": 3.6956869009584663, + "grad_norm": 0.21067900955677032, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4627 + }, + { + "epoch": 3.696485623003195, + "grad_norm": 0.19520802795886993, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4628 + }, + { + "epoch": 3.6972843450479234, + "grad_norm": 0.08310793340206146, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4629 + }, + { + "epoch": 3.698083067092652, + "grad_norm": 0.2118932604789734, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4630 + }, + { + "epoch": 3.6988817891373804, + "grad_norm": 0.2236505001783371, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4631 + }, + { + "epoch": 3.6996805111821085, + "grad_norm": 0.16256077587604523, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4632 + }, + { + "epoch": 3.700479233226837, + "grad_norm": 0.14406970143318176, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4633 + }, + { + "epoch": 3.7012779552715656, + "grad_norm": 0.09738676995038986, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4634 + }, + { + "epoch": 3.702076677316294, + "grad_norm": 0.07531408965587616, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4635 + }, + { + "epoch": 3.702875399361022, + "grad_norm": 0.11631188541650772, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4636 + }, + { + "epoch": 3.7036741214057507, + "grad_norm": 0.11661874502897263, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4637 + }, + { + "epoch": 3.7044728434504792, + "grad_norm": 0.11709950119256973, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 4638 + }, + { + "epoch": 3.7052715654952078, + "grad_norm": 0.13420704007148743, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4639 + }, + { + "epoch": 3.7060702875399363, + "grad_norm": 0.08842958509922028, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4640 + }, + { + "epoch": 3.706869009584665, + "grad_norm": 0.07295326143503189, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4641 + }, + { + "epoch": 3.707667731629393, + "grad_norm": 0.14573390781879425, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4642 + }, + { + "epoch": 3.7084664536741214, + "grad_norm": 0.06639868766069412, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4643 + }, + { + "epoch": 3.70926517571885, + "grad_norm": 0.05936001241207123, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4644 + }, + { + "epoch": 3.710063897763578, + "grad_norm": 0.06534209847450256, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4645 + }, + { + "epoch": 3.7108626198083066, + "grad_norm": 0.13101834058761597, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4646 + }, + { + "epoch": 3.711661341853035, + "grad_norm": 0.07707498222589493, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4647 + }, + { + "epoch": 3.7124600638977636, + "grad_norm": 0.09272165596485138, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4648 + }, + { + "epoch": 3.713258785942492, + "grad_norm": 0.12538838386535645, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4649 + }, + { + "epoch": 3.7140575079872207, + "grad_norm": 0.10816318541765213, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4650 + }, + { + "epoch": 3.7148562300319488, + "grad_norm": 0.10610290616750717, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4651 + }, + { + "epoch": 3.7156549520766773, + "grad_norm": 0.09520592540502548, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4652 + }, + { + "epoch": 3.716453674121406, + "grad_norm": 0.05595150217413902, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4653 + }, + { + "epoch": 3.7172523961661343, + "grad_norm": 0.08114545047283173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4654 + }, + { + "epoch": 3.7180511182108624, + "grad_norm": 0.16090086102485657, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4655 + }, + { + "epoch": 3.718849840255591, + "grad_norm": 0.16332058608531952, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4656 + }, + { + "epoch": 3.7196485623003195, + "grad_norm": 0.17694437503814697, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4657 + }, + { + "epoch": 3.720447284345048, + "grad_norm": 0.16341771185398102, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4658 + }, + { + "epoch": 3.7212460063897765, + "grad_norm": 0.12268038839101791, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4659 + }, + { + "epoch": 3.722044728434505, + "grad_norm": 0.09971031546592712, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4660 + }, + { + "epoch": 3.722843450479233, + "grad_norm": 0.08546486496925354, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4661 + }, + { + "epoch": 3.7236421725239617, + "grad_norm": 0.15427617728710175, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4662 + }, + { + "epoch": 3.72444089456869, + "grad_norm": 0.1291000247001648, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4663 + }, + { + "epoch": 3.7252396166134183, + "grad_norm": 0.06823746860027313, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4664 + }, + { + "epoch": 3.726038338658147, + "grad_norm": 0.08133388310670853, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4665 + }, + { + "epoch": 3.7268370607028753, + "grad_norm": 0.08803416788578033, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4666 + }, + { + "epoch": 3.727635782747604, + "grad_norm": 0.05898858234286308, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4667 + }, + { + "epoch": 3.7284345047923324, + "grad_norm": 0.07650687545537949, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 4668 + }, + { + "epoch": 3.729233226837061, + "grad_norm": 0.15048138797283173, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4669 + }, + { + "epoch": 3.730031948881789, + "grad_norm": 0.08594254404306412, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4670 + }, + { + "epoch": 3.7308306709265175, + "grad_norm": 0.05322937294840813, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4671 + }, + { + "epoch": 3.731629392971246, + "grad_norm": 0.14541727304458618, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4672 + }, + { + "epoch": 3.7324281150159746, + "grad_norm": 0.10300826281309128, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4673 + }, + { + "epoch": 3.7332268370607027, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4674 + }, + { + "epoch": 3.734025559105431, + "grad_norm": 0.07101032137870789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4675 + }, + { + "epoch": 3.7348242811501597, + "grad_norm": 0.09166763722896576, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4676 + }, + { + "epoch": 3.7356230031948883, + "grad_norm": 0.06929054856300354, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4677 + }, + { + "epoch": 3.736421725239617, + "grad_norm": 0.05935844033956528, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4678 + }, + { + "epoch": 3.737220447284345, + "grad_norm": 0.09101571142673492, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4679 + }, + { + "epoch": 3.7380191693290734, + "grad_norm": 0.0979514792561531, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4680 + }, + { + "epoch": 3.738817891373802, + "grad_norm": 0.07105522602796555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4681 + }, + { + "epoch": 3.7396166134185305, + "grad_norm": 0.05741708725690842, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4682 + }, + { + "epoch": 3.7404153354632586, + "grad_norm": 0.051515400409698486, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4683 + }, + { + "epoch": 3.741214057507987, + "grad_norm": 0.06484496593475342, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4684 + }, + { + "epoch": 3.7420127795527156, + "grad_norm": 0.056751761585474014, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4685 + }, + { + "epoch": 3.742811501597444, + "grad_norm": 0.09628041833639145, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4686 + }, + { + "epoch": 3.7436102236421727, + "grad_norm": 0.13367851078510284, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 4687 + }, + { + "epoch": 3.744408945686901, + "grad_norm": 0.10439570248126984, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4688 + }, + { + "epoch": 3.7452076677316293, + "grad_norm": 0.05516012758016586, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4689 + }, + { + "epoch": 3.746006389776358, + "grad_norm": 0.0721910372376442, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 4690 + }, + { + "epoch": 3.7468051118210863, + "grad_norm": 0.10327166318893433, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4691 + }, + { + "epoch": 3.747603833865815, + "grad_norm": 0.10419414937496185, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4692 + }, + { + "epoch": 3.748402555910543, + "grad_norm": 0.07322157919406891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4693 + }, + { + "epoch": 3.7492012779552715, + "grad_norm": 0.05000368133187294, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4694 + }, + { + "epoch": 3.75, + "grad_norm": 0.055239707231521606, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4695 + }, + { + "epoch": 3.7507987220447285, + "grad_norm": 0.14060117304325104, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 4696 + }, + { + "epoch": 3.751597444089457, + "grad_norm": 0.1366022527217865, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4697 + }, + { + "epoch": 3.752396166134185, + "grad_norm": 0.15003731846809387, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4698 + }, + { + "epoch": 3.7531948881789137, + "grad_norm": 0.11602472513914108, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4699 + }, + { + "epoch": 3.753993610223642, + "grad_norm": 0.06956090778112411, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4700 + }, + { + "epoch": 3.7547923322683707, + "grad_norm": 0.04711974412202835, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4701 + }, + { + "epoch": 3.755591054313099, + "grad_norm": 0.09257466346025467, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4702 + }, + { + "epoch": 3.7563897763578273, + "grad_norm": 0.06598426401615143, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4703 + }, + { + "epoch": 3.757188498402556, + "grad_norm": 0.06239036098122597, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4704 + }, + { + "epoch": 3.7579872204472844, + "grad_norm": 0.10065969824790955, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4705 + }, + { + "epoch": 3.758785942492013, + "grad_norm": 0.12874993681907654, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4706 + }, + { + "epoch": 3.7595846645367414, + "grad_norm": 0.10291960090398788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4707 + }, + { + "epoch": 3.7603833865814695, + "grad_norm": 0.06138000637292862, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4708 + }, + { + "epoch": 3.761182108626198, + "grad_norm": 0.11565262079238892, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4709 + }, + { + "epoch": 3.7619808306709266, + "grad_norm": 0.08041521906852722, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4710 + }, + { + "epoch": 3.762779552715655, + "grad_norm": 0.07228218764066696, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4711 + }, + { + "epoch": 3.763578274760383, + "grad_norm": 0.09155906736850739, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4712 + }, + { + "epoch": 3.7643769968051117, + "grad_norm": 0.07468429207801819, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4713 + }, + { + "epoch": 3.7651757188498403, + "grad_norm": 0.07629574090242386, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4714 + }, + { + "epoch": 3.765974440894569, + "grad_norm": 0.1118689477443695, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4715 + }, + { + "epoch": 3.7667731629392973, + "grad_norm": 0.07983580976724625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4716 + }, + { + "epoch": 3.7675718849840254, + "grad_norm": 0.07225694507360458, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4717 + }, + { + "epoch": 3.768370607028754, + "grad_norm": 0.1322079598903656, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4718 + }, + { + "epoch": 3.7691693290734825, + "grad_norm": 0.17217211425304413, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4719 + }, + { + "epoch": 3.769968051118211, + "grad_norm": 0.14665336906909943, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4720 + }, + { + "epoch": 3.770766773162939, + "grad_norm": 0.09977035969495773, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4721 + }, + { + "epoch": 3.7715654952076676, + "grad_norm": 0.1346946358680725, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4722 + }, + { + "epoch": 3.772364217252396, + "grad_norm": 0.17330871522426605, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 4723 + }, + { + "epoch": 3.7731629392971247, + "grad_norm": 0.17789506912231445, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4724 + }, + { + "epoch": 3.773961661341853, + "grad_norm": 0.06285518407821655, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4725 + }, + { + "epoch": 3.7747603833865817, + "grad_norm": 0.13192926347255707, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4726 + }, + { + "epoch": 3.77555910543131, + "grad_norm": 0.12157132476568222, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4727 + }, + { + "epoch": 3.7763578274760383, + "grad_norm": 0.1203337088227272, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4728 + }, + { + "epoch": 3.777156549520767, + "grad_norm": 0.16711866855621338, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4729 + }, + { + "epoch": 3.777955271565495, + "grad_norm": 0.13596504926681519, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4730 + }, + { + "epoch": 3.7787539936102235, + "grad_norm": 0.13502761721611023, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4731 + }, + { + "epoch": 3.779552715654952, + "grad_norm": 0.0751141607761383, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4732 + }, + { + "epoch": 3.7803514376996805, + "grad_norm": 0.1104620099067688, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4733 + }, + { + "epoch": 3.781150159744409, + "grad_norm": 0.06397949904203415, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4734 + }, + { + "epoch": 3.7819488817891376, + "grad_norm": 0.07850230485200882, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 4735 + }, + { + "epoch": 3.7827476038338657, + "grad_norm": 0.10330549627542496, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4736 + }, + { + "epoch": 3.783546325878594, + "grad_norm": 0.08978938311338425, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4737 + }, + { + "epoch": 3.7843450479233227, + "grad_norm": 0.07073058933019638, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4738 + }, + { + "epoch": 3.7851437699680512, + "grad_norm": 0.05997786670923233, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4739 + }, + { + "epoch": 3.7859424920127793, + "grad_norm": 0.0779404565691948, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4740 + }, + { + "epoch": 3.786741214057508, + "grad_norm": 0.1367640644311905, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4741 + }, + { + "epoch": 3.7875399361022364, + "grad_norm": 0.08670534938573837, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4742 + }, + { + "epoch": 3.788338658146965, + "grad_norm": 0.08612547069787979, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4743 + }, + { + "epoch": 3.7891373801916934, + "grad_norm": 0.06312929093837738, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4744 + }, + { + "epoch": 3.789936102236422, + "grad_norm": 0.06397293508052826, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4745 + }, + { + "epoch": 3.79073482428115, + "grad_norm": 0.0663115605711937, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4746 + }, + { + "epoch": 3.7915335463258786, + "grad_norm": 0.07580576092004776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4747 + }, + { + "epoch": 3.792332268370607, + "grad_norm": 0.12604761123657227, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4748 + }, + { + "epoch": 3.793130990415335, + "grad_norm": 0.08900050073862076, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4749 + }, + { + "epoch": 3.7939297124600637, + "grad_norm": 0.09280730038881302, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4750 + }, + { + "epoch": 3.7947284345047922, + "grad_norm": 0.17689163982868195, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 4751 + }, + { + "epoch": 3.7955271565495208, + "grad_norm": 0.06348183006048203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4752 + }, + { + "epoch": 3.7963258785942493, + "grad_norm": 0.12626387178897858, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 4753 + }, + { + "epoch": 3.797124600638978, + "grad_norm": 0.1138390377163887, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4754 + }, + { + "epoch": 3.797923322683706, + "grad_norm": 0.08058728277683258, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4755 + }, + { + "epoch": 3.7987220447284344, + "grad_norm": 0.09671882539987564, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4756 + }, + { + "epoch": 3.799520766773163, + "grad_norm": 0.12193922698497772, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4757 + }, + { + "epoch": 3.8003194888178915, + "grad_norm": 0.31105268001556396, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4758 + }, + { + "epoch": 3.8011182108626196, + "grad_norm": 0.10482051223516464, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4759 + }, + { + "epoch": 3.801916932907348, + "grad_norm": 0.09116382896900177, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4760 + }, + { + "epoch": 3.8027156549520766, + "grad_norm": 0.08212421089410782, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4761 + }, + { + "epoch": 3.803514376996805, + "grad_norm": 0.08267461508512497, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4762 + }, + { + "epoch": 3.8043130990415337, + "grad_norm": 0.13247907161712646, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4763 + }, + { + "epoch": 3.8051118210862622, + "grad_norm": 0.1083490327000618, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 4764 + }, + { + "epoch": 3.8059105431309903, + "grad_norm": 0.11947019398212433, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4765 + }, + { + "epoch": 3.806709265175719, + "grad_norm": 0.08462221175432205, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4766 + }, + { + "epoch": 3.8075079872204474, + "grad_norm": 0.07244928181171417, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4767 + }, + { + "epoch": 3.8083067092651754, + "grad_norm": 0.13432611525058746, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4768 + }, + { + "epoch": 3.809105431309904, + "grad_norm": 0.16640888154506683, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4769 + }, + { + "epoch": 3.8099041533546325, + "grad_norm": 0.12189232558012009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4770 + }, + { + "epoch": 3.810702875399361, + "grad_norm": 0.052367180585861206, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4771 + }, + { + "epoch": 3.8115015974440896, + "grad_norm": 0.10426424443721771, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4772 + }, + { + "epoch": 3.812300319488818, + "grad_norm": 0.11365417391061783, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4773 + }, + { + "epoch": 3.813099041533546, + "grad_norm": 0.07064168155193329, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4774 + }, + { + "epoch": 3.8138977635782747, + "grad_norm": 0.2107549011707306, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4775 + }, + { + "epoch": 3.8146964856230032, + "grad_norm": 0.2984449565410614, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4776 + }, + { + "epoch": 3.8154952076677318, + "grad_norm": 0.26252058148384094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4777 + }, + { + "epoch": 3.81629392971246, + "grad_norm": 0.08128907531499863, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4778 + }, + { + "epoch": 3.8170926517571884, + "grad_norm": 0.2724008858203888, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4779 + }, + { + "epoch": 3.817891373801917, + "grad_norm": 0.2646482288837433, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4780 + }, + { + "epoch": 3.8186900958466454, + "grad_norm": 0.16063876450061798, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4781 + }, + { + "epoch": 3.819488817891374, + "grad_norm": 0.11671862006187439, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4782 + }, + { + "epoch": 3.8202875399361025, + "grad_norm": 0.21605245769023895, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4783 + }, + { + "epoch": 3.8210862619808306, + "grad_norm": 0.17344583570957184, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4784 + }, + { + "epoch": 3.821884984025559, + "grad_norm": 0.08113347738981247, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4785 + }, + { + "epoch": 3.8226837060702876, + "grad_norm": 0.11774581670761108, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4786 + }, + { + "epoch": 3.8234824281150157, + "grad_norm": 0.2024560272693634, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4787 + }, + { + "epoch": 3.8242811501597442, + "grad_norm": 0.5578162670135498, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 4788 + }, + { + "epoch": 3.8250798722044728, + "grad_norm": 0.10354574024677277, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4789 + }, + { + "epoch": 3.8258785942492013, + "grad_norm": 0.14583979547023773, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4790 + }, + { + "epoch": 3.82667731629393, + "grad_norm": 0.15853755176067352, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4791 + }, + { + "epoch": 3.8274760383386583, + "grad_norm": 0.1308104395866394, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4792 + }, + { + "epoch": 3.8282747603833864, + "grad_norm": 0.04385368898510933, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4793 + }, + { + "epoch": 3.829073482428115, + "grad_norm": 0.16213825345039368, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4794 + }, + { + "epoch": 3.8298722044728435, + "grad_norm": 0.2693546414375305, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4795 + }, + { + "epoch": 3.830670926517572, + "grad_norm": 0.23904170095920563, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4796 + }, + { + "epoch": 3.8314696485623, + "grad_norm": 0.11313450336456299, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4797 + }, + { + "epoch": 3.8322683706070286, + "grad_norm": 0.0770820751786232, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4798 + }, + { + "epoch": 3.833067092651757, + "grad_norm": 0.8537606596946716, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4799 + }, + { + "epoch": 3.8338658146964857, + "grad_norm": 0.13684043288230896, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4800 + }, + { + "epoch": 3.834664536741214, + "grad_norm": 0.0890694409608841, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 4801 + }, + { + "epoch": 3.8354632587859427, + "grad_norm": 0.060917336493730545, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4802 + }, + { + "epoch": 3.836261980830671, + "grad_norm": 0.13864673674106598, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4803 + }, + { + "epoch": 3.8370607028753994, + "grad_norm": 0.15316139161586761, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4804 + }, + { + "epoch": 3.837859424920128, + "grad_norm": 0.061508018523454666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4805 + }, + { + "epoch": 3.838658146964856, + "grad_norm": 0.126112699508667, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4806 + }, + { + "epoch": 3.8394568690095845, + "grad_norm": 0.1663133054971695, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4807 + }, + { + "epoch": 3.840255591054313, + "grad_norm": 0.14435894787311554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4808 + }, + { + "epoch": 3.8410543130990416, + "grad_norm": 0.06042332574725151, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4809 + }, + { + "epoch": 3.84185303514377, + "grad_norm": 0.12759631872177124, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4810 + }, + { + "epoch": 3.8426517571884986, + "grad_norm": 0.18153302371501923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4811 + }, + { + "epoch": 3.8434504792332267, + "grad_norm": 0.1280708760023117, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4812 + }, + { + "epoch": 3.844249201277955, + "grad_norm": 0.07144157588481903, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4813 + }, + { + "epoch": 3.8450479233226837, + "grad_norm": 0.13078796863555908, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4814 + }, + { + "epoch": 3.8458466453674123, + "grad_norm": 0.16230762004852295, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4815 + }, + { + "epoch": 3.8466453674121404, + "grad_norm": 0.10997766256332397, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4816 + }, + { + "epoch": 3.847444089456869, + "grad_norm": 0.06006971001625061, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4817 + }, + { + "epoch": 3.8482428115015974, + "grad_norm": 0.10155797749757767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4818 + }, + { + "epoch": 3.849041533546326, + "grad_norm": 0.11125919967889786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4819 + }, + { + "epoch": 3.8498402555910545, + "grad_norm": 0.0860416367650032, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 4820 + }, + { + "epoch": 3.850638977635783, + "grad_norm": 0.0862870067358017, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4821 + }, + { + "epoch": 3.851437699680511, + "grad_norm": 0.07229744642972946, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4822 + }, + { + "epoch": 3.8522364217252396, + "grad_norm": 0.10448424518108368, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4823 + }, + { + "epoch": 3.853035143769968, + "grad_norm": 0.08971705287694931, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 4824 + }, + { + "epoch": 3.8538338658146962, + "grad_norm": 0.09876695275306702, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4825 + }, + { + "epoch": 3.8546325878594248, + "grad_norm": 0.0667971819639206, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4826 + }, + { + "epoch": 3.8554313099041533, + "grad_norm": 0.14437620341777802, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4827 + }, + { + "epoch": 3.856230031948882, + "grad_norm": 0.17627735435962677, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4828 + }, + { + "epoch": 3.8570287539936103, + "grad_norm": 0.10524439066648483, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4829 + }, + { + "epoch": 3.857827476038339, + "grad_norm": 0.15091893076896667, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4830 + }, + { + "epoch": 3.858626198083067, + "grad_norm": 0.22534102201461792, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4831 + }, + { + "epoch": 3.8594249201277955, + "grad_norm": 0.08298768103122711, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4832 + }, + { + "epoch": 3.860223642172524, + "grad_norm": 0.16647395491600037, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4833 + }, + { + "epoch": 3.8610223642172525, + "grad_norm": 0.22512534260749817, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4834 + }, + { + "epoch": 3.8618210862619806, + "grad_norm": 0.2130710482597351, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4835 + }, + { + "epoch": 3.862619808306709, + "grad_norm": 0.1250864863395691, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4836 + }, + { + "epoch": 3.8634185303514377, + "grad_norm": 0.13937048614025116, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4837 + }, + { + "epoch": 3.864217252396166, + "grad_norm": 0.19059741497039795, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4838 + }, + { + "epoch": 3.8650159744408947, + "grad_norm": 0.22080829739570618, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4839 + }, + { + "epoch": 3.8658146964856233, + "grad_norm": 0.09463749825954437, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 4840 + }, + { + "epoch": 3.8666134185303513, + "grad_norm": 0.16431698203086853, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4841 + }, + { + "epoch": 3.86741214057508, + "grad_norm": 0.2162260264158249, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4842 + }, + { + "epoch": 3.8682108626198084, + "grad_norm": 0.0789603665471077, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4843 + }, + { + "epoch": 3.8690095846645365, + "grad_norm": 0.18372099101543427, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4844 + }, + { + "epoch": 3.869808306709265, + "grad_norm": 0.24845194816589355, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4845 + }, + { + "epoch": 3.8706070287539935, + "grad_norm": 0.22064632177352905, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4846 + }, + { + "epoch": 3.871405750798722, + "grad_norm": 0.0718264952301979, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4847 + }, + { + "epoch": 3.8722044728434506, + "grad_norm": 0.2048031985759735, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4848 + }, + { + "epoch": 3.873003194888179, + "grad_norm": 0.23190200328826904, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4849 + }, + { + "epoch": 3.873801916932907, + "grad_norm": 0.06851150840520859, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4850 + }, + { + "epoch": 3.8746006389776357, + "grad_norm": 0.2371164858341217, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4851 + }, + { + "epoch": 3.8753993610223643, + "grad_norm": 0.23518243432044983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4852 + }, + { + "epoch": 3.876198083067093, + "grad_norm": 0.08026961237192154, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4853 + }, + { + "epoch": 3.876996805111821, + "grad_norm": 0.1623634397983551, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4854 + }, + { + "epoch": 3.8777955271565494, + "grad_norm": 0.21676453948020935, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4855 + }, + { + "epoch": 3.878594249201278, + "grad_norm": 0.07868681848049164, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4856 + }, + { + "epoch": 3.8793929712460065, + "grad_norm": 0.18302997946739197, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4857 + }, + { + "epoch": 3.880191693290735, + "grad_norm": 0.2338407188653946, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4858 + }, + { + "epoch": 3.8809904153354635, + "grad_norm": 0.2534898817539215, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4859 + }, + { + "epoch": 3.8817891373801916, + "grad_norm": 0.19988521933555603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4860 + }, + { + "epoch": 3.88258785942492, + "grad_norm": 0.2896076440811157, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4861 + }, + { + "epoch": 3.8833865814696487, + "grad_norm": 0.1088651567697525, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4862 + }, + { + "epoch": 3.8841853035143767, + "grad_norm": 0.18549342453479767, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4863 + }, + { + "epoch": 3.8849840255591053, + "grad_norm": 0.24760019779205322, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4864 + }, + { + "epoch": 3.885782747603834, + "grad_norm": 0.1323750913143158, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4865 + }, + { + "epoch": 3.8865814696485623, + "grad_norm": 0.14235283434391022, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4866 + }, + { + "epoch": 3.887380191693291, + "grad_norm": 0.20409083366394043, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4867 + }, + { + "epoch": 3.8881789137380194, + "grad_norm": 0.1743297129869461, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4868 + }, + { + "epoch": 3.8889776357827475, + "grad_norm": 0.09692966938018799, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4869 + }, + { + "epoch": 3.889776357827476, + "grad_norm": 0.09934467077255249, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4870 + }, + { + "epoch": 3.8905750798722045, + "grad_norm": 0.2410827875137329, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4871 + }, + { + "epoch": 3.891373801916933, + "grad_norm": 0.27096229791641235, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4872 + }, + { + "epoch": 3.892172523961661, + "grad_norm": 0.09133906662464142, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4873 + }, + { + "epoch": 3.8929712460063897, + "grad_norm": 0.20275604724884033, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4874 + }, + { + "epoch": 3.893769968051118, + "grad_norm": 0.19578030705451965, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4875 + }, + { + "epoch": 3.8945686900958467, + "grad_norm": 0.12888970971107483, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4876 + }, + { + "epoch": 3.8953674121405752, + "grad_norm": 0.10301528871059418, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4877 + }, + { + "epoch": 3.8961661341853038, + "grad_norm": 0.1635914444923401, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4878 + }, + { + "epoch": 3.896964856230032, + "grad_norm": 0.1971803456544876, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4879 + }, + { + "epoch": 3.8977635782747604, + "grad_norm": 0.1085273027420044, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4880 + }, + { + "epoch": 3.898562300319489, + "grad_norm": 0.07375707477331161, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4881 + }, + { + "epoch": 3.899361022364217, + "grad_norm": 0.5828747153282166, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4882 + }, + { + "epoch": 3.9001597444089455, + "grad_norm": 0.10320120304822922, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4883 + }, + { + "epoch": 3.900958466453674, + "grad_norm": 0.10118676722049713, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4884 + }, + { + "epoch": 3.9017571884984026, + "grad_norm": 0.22034543752670288, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4885 + }, + { + "epoch": 3.902555910543131, + "grad_norm": 0.21823646128177643, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4886 + }, + { + "epoch": 3.9033546325878596, + "grad_norm": 0.14776065945625305, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4887 + }, + { + "epoch": 3.9041533546325877, + "grad_norm": 0.13297663629055023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4888 + }, + { + "epoch": 3.9049520766773163, + "grad_norm": 0.4447253942489624, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4889 + }, + { + "epoch": 3.905750798722045, + "grad_norm": 0.171112522482872, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4890 + }, + { + "epoch": 3.9065495207667733, + "grad_norm": 0.1581616848707199, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4891 + }, + { + "epoch": 3.9073482428115014, + "grad_norm": 0.18396562337875366, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4892 + }, + { + "epoch": 3.90814696485623, + "grad_norm": 0.15952393412590027, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4893 + }, + { + "epoch": 3.9089456869009584, + "grad_norm": 0.12889564037322998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4894 + }, + { + "epoch": 3.909744408945687, + "grad_norm": 0.130104660987854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4895 + }, + { + "epoch": 3.9105431309904155, + "grad_norm": 0.13011464476585388, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4896 + }, + { + "epoch": 3.911341853035144, + "grad_norm": 0.06485363095998764, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4897 + }, + { + "epoch": 3.912140575079872, + "grad_norm": 0.11353932321071625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4898 + }, + { + "epoch": 3.9129392971246006, + "grad_norm": 0.13279879093170166, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4899 + }, + { + "epoch": 3.913738019169329, + "grad_norm": 0.19181469082832336, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 4900 + }, + { + "epoch": 3.9145367412140573, + "grad_norm": 0.06930892914533615, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4901 + }, + { + "epoch": 3.915335463258786, + "grad_norm": 0.10591714829206467, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4902 + }, + { + "epoch": 3.9161341853035143, + "grad_norm": 0.09693296998739243, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4903 + }, + { + "epoch": 3.916932907348243, + "grad_norm": 0.1604270488023758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4904 + }, + { + "epoch": 3.9177316293929714, + "grad_norm": 0.19874586164951324, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4905 + }, + { + "epoch": 3.9185303514377, + "grad_norm": 0.09015987068414688, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4906 + }, + { + "epoch": 3.919329073482428, + "grad_norm": 0.09864864498376846, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4907 + }, + { + "epoch": 3.9201277955271565, + "grad_norm": 0.12509673833847046, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4908 + }, + { + "epoch": 3.920926517571885, + "grad_norm": 0.10216362774372101, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4909 + }, + { + "epoch": 3.9217252396166136, + "grad_norm": 0.11854741722345352, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4910 + }, + { + "epoch": 3.9225239616613417, + "grad_norm": 0.08570919930934906, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4911 + }, + { + "epoch": 3.92332268370607, + "grad_norm": 0.095781609416008, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4912 + }, + { + "epoch": 3.9241214057507987, + "grad_norm": 0.05698491260409355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4913 + }, + { + "epoch": 3.9249201277955272, + "grad_norm": 0.09786297380924225, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4914 + }, + { + "epoch": 3.9257188498402558, + "grad_norm": 0.1206512302160263, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4915 + }, + { + "epoch": 3.9265175718849843, + "grad_norm": 0.07593982666730881, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4916 + }, + { + "epoch": 3.9273162939297124, + "grad_norm": 0.06973730027675629, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4917 + }, + { + "epoch": 3.928115015974441, + "grad_norm": 0.07377546280622482, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4918 + }, + { + "epoch": 3.9289137380191694, + "grad_norm": 0.06871537119150162, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4919 + }, + { + "epoch": 3.9297124600638975, + "grad_norm": 0.09697525203227997, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4920 + }, + { + "epoch": 3.930511182108626, + "grad_norm": 0.07418478280305862, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4921 + }, + { + "epoch": 3.9313099041533546, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4922 + }, + { + "epoch": 3.932108626198083, + "grad_norm": 0.08099815994501114, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4923 + }, + { + "epoch": 3.9329073482428116, + "grad_norm": 0.08033913373947144, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4924 + }, + { + "epoch": 3.93370607028754, + "grad_norm": 0.1089775413274765, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4925 + }, + { + "epoch": 3.9345047923322682, + "grad_norm": 0.06866748631000519, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4926 + }, + { + "epoch": 3.9353035143769968, + "grad_norm": 0.12346489727497101, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4927 + }, + { + "epoch": 3.9361022364217253, + "grad_norm": 0.1388891190290451, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4928 + }, + { + "epoch": 3.936900958466454, + "grad_norm": 0.12678411602973938, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4929 + }, + { + "epoch": 3.937699680511182, + "grad_norm": 0.08638305962085724, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4930 + }, + { + "epoch": 3.9384984025559104, + "grad_norm": 0.667020320892334, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 4931 + }, + { + "epoch": 3.939297124600639, + "grad_norm": 0.0867542177438736, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4932 + }, + { + "epoch": 3.9400958466453675, + "grad_norm": 0.1075657457113266, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4933 + }, + { + "epoch": 3.940894568690096, + "grad_norm": 0.10359356552362442, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4934 + }, + { + "epoch": 3.9416932907348246, + "grad_norm": 0.04861772805452347, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4935 + }, + { + "epoch": 3.9424920127795526, + "grad_norm": 0.08871651440858841, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4936 + }, + { + "epoch": 3.943290734824281, + "grad_norm": 0.05268944799900055, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4937 + }, + { + "epoch": 3.9440894568690097, + "grad_norm": 0.11428069323301315, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4938 + }, + { + "epoch": 3.9448881789137378, + "grad_norm": 0.1302616149187088, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4939 + }, + { + "epoch": 3.9456869009584663, + "grad_norm": 0.09091098606586456, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4940 + }, + { + "epoch": 3.946485623003195, + "grad_norm": 0.23224923014640808, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4941 + }, + { + "epoch": 3.9472843450479234, + "grad_norm": 0.13427230715751648, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4942 + }, + { + "epoch": 3.948083067092652, + "grad_norm": 0.24157744646072388, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4943 + }, + { + "epoch": 3.9488817891373804, + "grad_norm": 0.15497569739818573, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4944 + }, + { + "epoch": 3.9496805111821085, + "grad_norm": 0.15587151050567627, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4945 + }, + { + "epoch": 3.950479233226837, + "grad_norm": 0.0827038437128067, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4946 + }, + { + "epoch": 3.9512779552715656, + "grad_norm": 0.17405007779598236, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4947 + }, + { + "epoch": 3.952076677316294, + "grad_norm": 0.1612532138824463, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4948 + }, + { + "epoch": 3.952875399361022, + "grad_norm": 0.07505665719509125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4949 + }, + { + "epoch": 3.9536741214057507, + "grad_norm": 0.07138567417860031, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4950 + }, + { + "epoch": 3.9544728434504792, + "grad_norm": 0.09206511080265045, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4951 + }, + { + "epoch": 3.9552715654952078, + "grad_norm": 0.09190725535154343, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4952 + }, + { + "epoch": 3.9560702875399363, + "grad_norm": 0.13024544715881348, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4953 + }, + { + "epoch": 3.956869009584665, + "grad_norm": 0.08161026239395142, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4954 + }, + { + "epoch": 3.957667731629393, + "grad_norm": 0.17207187414169312, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4955 + }, + { + "epoch": 3.9584664536741214, + "grad_norm": 0.096051886677742, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4956 + }, + { + "epoch": 3.95926517571885, + "grad_norm": 0.11038299649953842, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4957 + }, + { + "epoch": 3.960063897763578, + "grad_norm": 0.09957583248615265, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4958 + }, + { + "epoch": 3.9608626198083066, + "grad_norm": 0.06923667341470718, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4959 + }, + { + "epoch": 3.961661341853035, + "grad_norm": 0.07572069019079208, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4960 + }, + { + "epoch": 3.9624600638977636, + "grad_norm": 0.16801652312278748, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4961 + }, + { + "epoch": 3.963258785942492, + "grad_norm": 0.062117498368024826, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4962 + }, + { + "epoch": 3.9640575079872207, + "grad_norm": 0.08293396979570389, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4963 + }, + { + "epoch": 3.9648562300319488, + "grad_norm": 0.2021675407886505, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4964 + }, + { + "epoch": 3.9656549520766773, + "grad_norm": 0.10666973143815994, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4965 + }, + { + "epoch": 3.966453674121406, + "grad_norm": 0.09226572513580322, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4966 + }, + { + "epoch": 3.9672523961661343, + "grad_norm": 0.10113741457462311, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4967 + }, + { + "epoch": 3.9680511182108624, + "grad_norm": 0.10156626254320145, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4968 + }, + { + "epoch": 3.968849840255591, + "grad_norm": 0.08531442284584045, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4969 + }, + { + "epoch": 3.9696485623003195, + "grad_norm": 0.08894761651754379, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4970 + }, + { + "epoch": 3.970447284345048, + "grad_norm": 0.07934322953224182, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4971 + }, + { + "epoch": 3.9712460063897765, + "grad_norm": 0.07121701538562775, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4972 + }, + { + "epoch": 3.972044728434505, + "grad_norm": 0.09110251814126968, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4973 + }, + { + "epoch": 3.972843450479233, + "grad_norm": 0.09724952280521393, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4974 + }, + { + "epoch": 3.9736421725239617, + "grad_norm": 0.08619683235883713, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4975 + }, + { + "epoch": 3.97444089456869, + "grad_norm": 0.14789989590644836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4976 + }, + { + "epoch": 3.9752396166134183, + "grad_norm": 0.08736634254455566, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4977 + }, + { + "epoch": 3.976038338658147, + "grad_norm": 0.2260635793209076, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4978 + }, + { + "epoch": 3.9768370607028753, + "grad_norm": 0.2150910496711731, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4979 + }, + { + "epoch": 3.977635782747604, + "grad_norm": 0.12071242183446884, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4980 + }, + { + "epoch": 3.9784345047923324, + "grad_norm": 0.11614276468753815, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4981 + }, + { + "epoch": 3.979233226837061, + "grad_norm": 0.0954839214682579, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4982 + }, + { + "epoch": 3.980031948881789, + "grad_norm": 0.09801400452852249, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4983 + }, + { + "epoch": 3.9808306709265175, + "grad_norm": 0.07435343414545059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4984 + }, + { + "epoch": 3.981629392971246, + "grad_norm": 0.09401766955852509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4985 + }, + { + "epoch": 3.9824281150159746, + "grad_norm": 0.09850753843784332, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4986 + }, + { + "epoch": 3.9832268370607027, + "grad_norm": 0.07880235463380814, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4987 + }, + { + "epoch": 3.984025559105431, + "grad_norm": 0.08208848536014557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4988 + }, + { + "epoch": 3.9848242811501597, + "grad_norm": 0.10432668030261993, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4989 + }, + { + "epoch": 3.9856230031948883, + "grad_norm": 0.05202944204211235, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4990 + }, + { + "epoch": 3.986421725239617, + "grad_norm": 0.0831860601902008, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4991 + }, + { + "epoch": 3.987220447284345, + "grad_norm": 0.1084689050912857, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4992 + }, + { + "epoch": 3.9880191693290734, + "grad_norm": 0.1095893383026123, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4993 + }, + { + "epoch": 3.988817891373802, + "grad_norm": 0.24480414390563965, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4994 + }, + { + "epoch": 3.9896166134185305, + "grad_norm": 0.11939835548400879, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4995 + }, + { + "epoch": 3.9904153354632586, + "grad_norm": 0.0829034298658371, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4996 + }, + { + "epoch": 3.991214057507987, + "grad_norm": 0.1649356484413147, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4997 + }, + { + "epoch": 3.9920127795527156, + "grad_norm": 0.18428824841976166, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4998 + }, + { + "epoch": 3.992811501597444, + "grad_norm": 0.14441022276878357, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4999 + }, + { + "epoch": 3.9936102236421727, + "grad_norm": 0.1025838553905487, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5000 + }, + { + "epoch": 3.994408945686901, + "grad_norm": 0.18659353256225586, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5001 + }, + { + "epoch": 3.9952076677316293, + "grad_norm": 0.18462489545345306, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5002 + }, + { + "epoch": 3.996006389776358, + "grad_norm": 0.11221570521593094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5003 + }, + { + "epoch": 3.9968051118210863, + "grad_norm": 0.1611207127571106, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5004 + }, + { + "epoch": 3.997603833865815, + "grad_norm": 0.10003258287906647, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5005 + }, + { + "epoch": 3.998402555910543, + "grad_norm": 0.06686410307884216, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5006 + }, + { + "epoch": 3.9992012779552715, + "grad_norm": 0.07527180016040802, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5007 + }, + { + "epoch": 4.0, + "grad_norm": 0.11602520197629929, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5008 + }, + { + "epoch": 4.0007987220447285, + "grad_norm": 0.04460546746850014, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5009 + }, + { + "epoch": 4.001597444089457, + "grad_norm": 1.1286108493804932, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5010 + }, + { + "epoch": 4.002396166134186, + "grad_norm": 0.12730571627616882, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5011 + }, + { + "epoch": 4.003194888178914, + "grad_norm": 0.060798924416303635, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5012 + }, + { + "epoch": 4.003993610223642, + "grad_norm": 0.11491188406944275, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5013 + }, + { + "epoch": 4.00479233226837, + "grad_norm": 0.09877663850784302, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5014 + }, + { + "epoch": 4.005591054313099, + "grad_norm": 0.06991511583328247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5015 + }, + { + "epoch": 4.006389776357827, + "grad_norm": 0.05524459481239319, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5016 + }, + { + "epoch": 4.007188498402556, + "grad_norm": 0.07421471178531647, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5017 + }, + { + "epoch": 4.007987220447284, + "grad_norm": 0.10918284207582474, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5018 + }, + { + "epoch": 4.008785942492013, + "grad_norm": 0.42926761507987976, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5019 + }, + { + "epoch": 4.0095846645367414, + "grad_norm": 0.12511351704597473, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5020 + }, + { + "epoch": 4.01038338658147, + "grad_norm": 0.0985826924443245, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5021 + }, + { + "epoch": 4.0111821086261985, + "grad_norm": 0.10876046866178513, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5022 + }, + { + "epoch": 4.011980830670926, + "grad_norm": 0.0973401740193367, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5023 + }, + { + "epoch": 4.012779552715655, + "grad_norm": 0.10867046564817429, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5024 + }, + { + "epoch": 4.013578274760383, + "grad_norm": 0.16030259430408478, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5025 + }, + { + "epoch": 4.014376996805112, + "grad_norm": 0.09972470998764038, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5026 + }, + { + "epoch": 4.01517571884984, + "grad_norm": 0.06945701688528061, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5027 + }, + { + "epoch": 4.015974440894569, + "grad_norm": 0.12256570160388947, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5028 + }, + { + "epoch": 4.016773162939297, + "grad_norm": 0.1318589597940445, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5029 + }, + { + "epoch": 4.017571884984026, + "grad_norm": 0.14831772446632385, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5030 + }, + { + "epoch": 4.018370607028754, + "grad_norm": 0.12650129199028015, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5031 + }, + { + "epoch": 4.019169329073482, + "grad_norm": 0.25457820296287537, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5032 + }, + { + "epoch": 4.0199680511182105, + "grad_norm": 0.10183271020650864, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5033 + }, + { + "epoch": 4.020766773162939, + "grad_norm": 0.14198726415634155, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5034 + }, + { + "epoch": 4.021565495207668, + "grad_norm": 0.1551627218723297, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5035 + }, + { + "epoch": 4.022364217252396, + "grad_norm": 0.29212328791618347, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5036 + }, + { + "epoch": 4.023162939297125, + "grad_norm": 0.25203290581703186, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5037 + }, + { + "epoch": 4.023961661341853, + "grad_norm": 0.12793950736522675, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5038 + }, + { + "epoch": 4.024760383386582, + "grad_norm": 0.10916420817375183, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5039 + }, + { + "epoch": 4.02555910543131, + "grad_norm": 0.09980735182762146, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5040 + }, + { + "epoch": 4.026357827476039, + "grad_norm": 0.1633901745080948, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5041 + }, + { + "epoch": 4.027156549520766, + "grad_norm": 0.10058299452066422, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5042 + }, + { + "epoch": 4.027955271565495, + "grad_norm": 0.08121561259031296, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5043 + }, + { + "epoch": 4.0287539936102235, + "grad_norm": 0.19947005808353424, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5044 + }, + { + "epoch": 4.029552715654952, + "grad_norm": 0.24219068884849548, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5045 + }, + { + "epoch": 4.0303514376996805, + "grad_norm": 0.28928735852241516, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5046 + }, + { + "epoch": 4.031150159744409, + "grad_norm": 0.062404267489910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5047 + }, + { + "epoch": 4.031948881789138, + "grad_norm": 0.1607569456100464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5048 + }, + { + "epoch": 4.032747603833866, + "grad_norm": 0.14420244097709656, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5049 + }, + { + "epoch": 4.033546325878595, + "grad_norm": 0.838013768196106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5050 + }, + { + "epoch": 4.034345047923322, + "grad_norm": 0.15198078751564026, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5051 + }, + { + "epoch": 4.035143769968051, + "grad_norm": 0.18439999222755432, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5052 + }, + { + "epoch": 4.035942492012779, + "grad_norm": 0.1283460259437561, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5053 + }, + { + "epoch": 4.036741214057508, + "grad_norm": 0.07285412400960922, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5054 + }, + { + "epoch": 4.037539936102236, + "grad_norm": 0.21856451034545898, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5055 + }, + { + "epoch": 4.038338658146965, + "grad_norm": 0.1934041529893875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5056 + }, + { + "epoch": 4.039137380191693, + "grad_norm": 0.07998216152191162, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5057 + }, + { + "epoch": 4.039936102236422, + "grad_norm": 0.2202988713979721, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5058 + }, + { + "epoch": 4.0407348242811505, + "grad_norm": 0.22000271081924438, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5059 + }, + { + "epoch": 4.041533546325879, + "grad_norm": 0.06229308247566223, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5060 + }, + { + "epoch": 4.042332268370607, + "grad_norm": 0.19611188769340515, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5061 + }, + { + "epoch": 4.043130990415335, + "grad_norm": 0.2385999858379364, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5062 + }, + { + "epoch": 4.043929712460064, + "grad_norm": 0.06504995375871658, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5063 + }, + { + "epoch": 4.044728434504792, + "grad_norm": 0.17860567569732666, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5064 + }, + { + "epoch": 4.045527156549521, + "grad_norm": 0.17580853402614594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5065 + }, + { + "epoch": 4.046325878594249, + "grad_norm": 0.06523217260837555, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5066 + }, + { + "epoch": 4.047124600638978, + "grad_norm": 0.2795565128326416, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5067 + }, + { + "epoch": 4.047923322683706, + "grad_norm": 0.289105623960495, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5068 + }, + { + "epoch": 4.048722044728435, + "grad_norm": 0.07829197496175766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5069 + }, + { + "epoch": 4.0495207667731625, + "grad_norm": 0.24165435135364532, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5070 + }, + { + "epoch": 4.050319488817891, + "grad_norm": 0.2785094976425171, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5071 + }, + { + "epoch": 4.05111821086262, + "grad_norm": 0.08929550647735596, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5072 + }, + { + "epoch": 4.051916932907348, + "grad_norm": 0.24677781760692596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5073 + }, + { + "epoch": 4.052715654952077, + "grad_norm": 0.25207674503326416, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5074 + }, + { + "epoch": 4.053514376996805, + "grad_norm": 0.06409729272127151, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5075 + }, + { + "epoch": 4.054313099041534, + "grad_norm": 0.2670205235481262, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5076 + }, + { + "epoch": 4.055111821086262, + "grad_norm": 0.1854943484067917, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5077 + }, + { + "epoch": 4.055910543130991, + "grad_norm": 0.1409354954957962, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5078 + }, + { + "epoch": 4.056709265175719, + "grad_norm": 0.24084609746932983, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5079 + }, + { + "epoch": 4.057507987220447, + "grad_norm": 0.16520382463932037, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5080 + }, + { + "epoch": 4.0583067092651754, + "grad_norm": 0.11086967587471008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5081 + }, + { + "epoch": 4.059105431309904, + "grad_norm": 0.15748612582683563, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5082 + }, + { + "epoch": 4.0599041533546325, + "grad_norm": 0.1196034848690033, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5083 + }, + { + "epoch": 4.060702875399361, + "grad_norm": 0.06799823045730591, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5084 + }, + { + "epoch": 4.06150159744409, + "grad_norm": 0.1223025768995285, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5085 + }, + { + "epoch": 4.062300319488818, + "grad_norm": 0.04760991781949997, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5086 + }, + { + "epoch": 4.063099041533547, + "grad_norm": 0.11782078444957733, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5087 + }, + { + "epoch": 4.063897763578275, + "grad_norm": 0.13057227432727814, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5088 + }, + { + "epoch": 4.064696485623003, + "grad_norm": 0.0719611644744873, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5089 + }, + { + "epoch": 4.065495207667731, + "grad_norm": 0.13513247668743134, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5090 + }, + { + "epoch": 4.06629392971246, + "grad_norm": 0.14960692822933197, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5091 + }, + { + "epoch": 4.067092651757188, + "grad_norm": 0.06219497323036194, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5092 + }, + { + "epoch": 4.067891373801917, + "grad_norm": 0.06755383312702179, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5093 + }, + { + "epoch": 4.068690095846645, + "grad_norm": 0.08237830549478531, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5094 + }, + { + "epoch": 4.069488817891374, + "grad_norm": 0.0915946289896965, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5095 + }, + { + "epoch": 4.0702875399361025, + "grad_norm": 0.06893479824066162, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5096 + }, + { + "epoch": 4.071086261980831, + "grad_norm": 0.04133071005344391, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5097 + }, + { + "epoch": 4.0718849840255595, + "grad_norm": 0.062333185225725174, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5098 + }, + { + "epoch": 4.072683706070287, + "grad_norm": 0.05741016939282417, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5099 + }, + { + "epoch": 4.073482428115016, + "grad_norm": 0.04988866671919823, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5100 + }, + { + "epoch": 4.074281150159744, + "grad_norm": 0.050187818706035614, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5101 + }, + { + "epoch": 4.075079872204473, + "grad_norm": 0.08479643613100052, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5102 + }, + { + "epoch": 4.075878594249201, + "grad_norm": 0.13840351998806, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5103 + }, + { + "epoch": 4.07667731629393, + "grad_norm": 0.11400903016328812, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5104 + }, + { + "epoch": 4.077476038338658, + "grad_norm": 0.06956811994314194, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5105 + }, + { + "epoch": 4.078274760383387, + "grad_norm": 0.09173833578824997, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5106 + }, + { + "epoch": 4.079073482428115, + "grad_norm": 0.09024006128311157, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5107 + }, + { + "epoch": 4.079872204472843, + "grad_norm": 0.04257406294345856, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5108 + }, + { + "epoch": 4.080670926517572, + "grad_norm": 0.04252707585692406, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5109 + }, + { + "epoch": 4.0814696485623, + "grad_norm": 0.052367035299539566, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5110 + }, + { + "epoch": 4.082268370607029, + "grad_norm": 0.06344939023256302, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5111 + }, + { + "epoch": 4.083067092651757, + "grad_norm": 0.04674215242266655, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5112 + }, + { + "epoch": 4.083865814696486, + "grad_norm": 0.03664534166455269, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5113 + }, + { + "epoch": 4.084664536741214, + "grad_norm": 0.07198764383792877, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5114 + }, + { + "epoch": 4.085463258785943, + "grad_norm": 0.06294529885053635, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5115 + }, + { + "epoch": 4.086261980830671, + "grad_norm": 0.09595668315887451, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5116 + }, + { + "epoch": 4.0870607028754, + "grad_norm": 0.09830893576145172, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5117 + }, + { + "epoch": 4.087859424920127, + "grad_norm": 0.09647611528635025, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5118 + }, + { + "epoch": 4.088658146964856, + "grad_norm": 0.04558149725198746, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5119 + }, + { + "epoch": 4.0894568690095845, + "grad_norm": 0.11090628057718277, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5120 + }, + { + "epoch": 4.090255591054313, + "grad_norm": 0.1119648665189743, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5121 + }, + { + "epoch": 4.0910543130990416, + "grad_norm": 0.0372939296066761, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5122 + }, + { + "epoch": 4.09185303514377, + "grad_norm": 0.10749047994613647, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5123 + }, + { + "epoch": 4.092651757188499, + "grad_norm": 0.08718341588973999, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5124 + }, + { + "epoch": 4.093450479233227, + "grad_norm": 0.04954478517174721, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5125 + }, + { + "epoch": 4.094249201277956, + "grad_norm": 0.0599503293633461, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5126 + }, + { + "epoch": 4.095047923322683, + "grad_norm": 0.04633599892258644, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5127 + }, + { + "epoch": 4.095846645367412, + "grad_norm": 0.0502074733376503, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5128 + }, + { + "epoch": 4.09664536741214, + "grad_norm": 0.1348472684621811, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5129 + }, + { + "epoch": 4.097444089456869, + "grad_norm": 0.07534858584403992, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5130 + }, + { + "epoch": 4.098242811501597, + "grad_norm": 0.04207107052206993, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5131 + }, + { + "epoch": 4.099041533546326, + "grad_norm": 0.062090687453746796, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5132 + }, + { + "epoch": 4.0998402555910545, + "grad_norm": 0.08783479779958725, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5133 + }, + { + "epoch": 4.100638977635783, + "grad_norm": 0.04489055275917053, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5134 + }, + { + "epoch": 4.1014376996805115, + "grad_norm": 0.07360105961561203, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5135 + }, + { + "epoch": 4.102236421725239, + "grad_norm": 0.10253020375967026, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5136 + }, + { + "epoch": 4.103035143769968, + "grad_norm": 0.12787389755249023, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5137 + }, + { + "epoch": 4.103833865814696, + "grad_norm": 0.43946513533592224, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5138 + }, + { + "epoch": 4.104632587859425, + "grad_norm": 0.7717093825340271, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5139 + }, + { + "epoch": 4.105431309904153, + "grad_norm": 0.1433849334716797, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5140 + }, + { + "epoch": 4.106230031948882, + "grad_norm": 0.09110052138566971, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5141 + }, + { + "epoch": 4.10702875399361, + "grad_norm": 0.13785111904144287, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5142 + }, + { + "epoch": 4.107827476038339, + "grad_norm": 0.0910695344209671, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5143 + }, + { + "epoch": 4.108626198083067, + "grad_norm": 0.10390721261501312, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5144 + }, + { + "epoch": 4.109424920127796, + "grad_norm": 0.07039178162813187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5145 + }, + { + "epoch": 4.110223642172524, + "grad_norm": 0.08536665886640549, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5146 + }, + { + "epoch": 4.111022364217252, + "grad_norm": 0.1355360597372055, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5147 + }, + { + "epoch": 4.111821086261981, + "grad_norm": 0.13981834053993225, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5148 + }, + { + "epoch": 4.112619808306709, + "grad_norm": 0.12653453648090363, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5149 + }, + { + "epoch": 4.113418530351438, + "grad_norm": 0.06805716454982758, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5150 + }, + { + "epoch": 4.114217252396166, + "grad_norm": 0.14361023902893066, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5151 + }, + { + "epoch": 4.115015974440895, + "grad_norm": 0.15223950147628784, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5152 + }, + { + "epoch": 4.115814696485623, + "grad_norm": 0.10013193637132645, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5153 + }, + { + "epoch": 4.116613418530352, + "grad_norm": 0.21049730479717255, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5154 + }, + { + "epoch": 4.11741214057508, + "grad_norm": 0.1393776834011078, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5155 + }, + { + "epoch": 4.118210862619808, + "grad_norm": 0.08584857732057571, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5156 + }, + { + "epoch": 4.1190095846645365, + "grad_norm": 0.06729432195425034, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5157 + }, + { + "epoch": 4.119808306709265, + "grad_norm": 0.08861853927373886, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5158 + }, + { + "epoch": 4.1206070287539935, + "grad_norm": 0.07037574052810669, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5159 + }, + { + "epoch": 4.121405750798722, + "grad_norm": 0.08049193024635315, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5160 + }, + { + "epoch": 4.122204472843451, + "grad_norm": 0.09040962159633636, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5161 + }, + { + "epoch": 4.123003194888179, + "grad_norm": 0.06531825661659241, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5162 + }, + { + "epoch": 4.123801916932908, + "grad_norm": 0.09423618763685226, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5163 + }, + { + "epoch": 4.124600638977636, + "grad_norm": 0.09436366707086563, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5164 + }, + { + "epoch": 4.125399361022364, + "grad_norm": 0.07543698698282242, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5165 + }, + { + "epoch": 4.126198083067092, + "grad_norm": 0.07491134852170944, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 5166 + }, + { + "epoch": 4.126996805111821, + "grad_norm": 0.09040437638759613, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5167 + }, + { + "epoch": 4.127795527156549, + "grad_norm": 0.11145798116922379, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5168 + }, + { + "epoch": 4.128594249201278, + "grad_norm": 0.35186707973480225, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5169 + }, + { + "epoch": 4.1293929712460065, + "grad_norm": 0.08744635432958603, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5170 + }, + { + "epoch": 4.130191693290735, + "grad_norm": 0.1078719049692154, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5171 + }, + { + "epoch": 4.1309904153354635, + "grad_norm": 0.13568760454654694, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5172 + }, + { + "epoch": 4.131789137380192, + "grad_norm": 0.10629335045814514, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5173 + }, + { + "epoch": 4.13258785942492, + "grad_norm": 0.3467697203159332, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5174 + }, + { + "epoch": 4.133386581469648, + "grad_norm": 0.5514479875564575, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5175 + }, + { + "epoch": 4.134185303514377, + "grad_norm": 0.2762874960899353, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5176 + }, + { + "epoch": 4.134984025559105, + "grad_norm": 0.25959524512290955, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5177 + }, + { + "epoch": 4.135782747603834, + "grad_norm": 0.26429036259651184, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5178 + }, + { + "epoch": 4.136581469648562, + "grad_norm": 0.4492235779762268, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5179 + }, + { + "epoch": 4.137380191693291, + "grad_norm": 0.3261977732181549, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 5180 + }, + { + "epoch": 4.138178913738019, + "grad_norm": 0.15618108212947845, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5181 + }, + { + "epoch": 4.138977635782748, + "grad_norm": 0.2897289991378784, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5182 + }, + { + "epoch": 4.139776357827476, + "grad_norm": 0.2599884271621704, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5183 + }, + { + "epoch": 4.140575079872204, + "grad_norm": 0.3158198893070221, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5184 + }, + { + "epoch": 4.141373801916933, + "grad_norm": 0.2701073884963989, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5185 + }, + { + "epoch": 4.142172523961661, + "grad_norm": 0.14668017625808716, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5186 + }, + { + "epoch": 4.14297124600639, + "grad_norm": 0.14284202456474304, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5187 + }, + { + "epoch": 4.143769968051118, + "grad_norm": 0.1901128888130188, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5188 + }, + { + "epoch": 4.144568690095847, + "grad_norm": 0.17808575928211212, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5189 + }, + { + "epoch": 4.145367412140575, + "grad_norm": 0.11329478025436401, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5190 + }, + { + "epoch": 4.146166134185304, + "grad_norm": 0.10816467553377151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5191 + }, + { + "epoch": 4.146964856230032, + "grad_norm": 0.11593834310770035, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5192 + }, + { + "epoch": 4.147763578274761, + "grad_norm": 0.17315705120563507, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5193 + }, + { + "epoch": 4.1485623003194885, + "grad_norm": 0.10884186625480652, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5194 + }, + { + "epoch": 4.149361022364217, + "grad_norm": 0.17528203129768372, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5195 + }, + { + "epoch": 4.1501597444089455, + "grad_norm": 0.3249641954898834, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5196 + }, + { + "epoch": 4.150958466453674, + "grad_norm": 0.2920859456062317, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5197 + }, + { + "epoch": 4.151757188498403, + "grad_norm": 0.12487918138504028, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5198 + }, + { + "epoch": 4.152555910543131, + "grad_norm": 0.07744348049163818, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5199 + }, + { + "epoch": 4.15335463258786, + "grad_norm": 0.11721999943256378, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5200 + }, + { + "epoch": 4.154153354632588, + "grad_norm": 0.17566390335559845, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5201 + }, + { + "epoch": 4.154952076677317, + "grad_norm": 0.09762726724147797, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5202 + }, + { + "epoch": 4.155750798722044, + "grad_norm": 0.10769844055175781, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5203 + }, + { + "epoch": 4.156549520766773, + "grad_norm": 0.1608363389968872, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5204 + }, + { + "epoch": 4.157348242811501, + "grad_norm": 0.1575978696346283, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5205 + }, + { + "epoch": 4.15814696485623, + "grad_norm": 0.2035059779882431, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5206 + }, + { + "epoch": 4.1589456869009584, + "grad_norm": 0.1405210644006729, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5207 + }, + { + "epoch": 4.159744408945687, + "grad_norm": 0.18898408114910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5208 + }, + { + "epoch": 4.1605431309904155, + "grad_norm": 0.20012563467025757, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5209 + }, + { + "epoch": 4.161341853035144, + "grad_norm": 0.14585568010807037, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5210 + }, + { + "epoch": 4.162140575079873, + "grad_norm": 0.166448175907135, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5211 + }, + { + "epoch": 4.1629392971246, + "grad_norm": 0.08768735080957413, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5212 + }, + { + "epoch": 4.163738019169329, + "grad_norm": 0.12429258227348328, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5213 + }, + { + "epoch": 4.164536741214057, + "grad_norm": 0.06750953942537308, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5214 + }, + { + "epoch": 4.165335463258786, + "grad_norm": 0.10137717425823212, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5215 + }, + { + "epoch": 4.166134185303514, + "grad_norm": 0.1015368178486824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5216 + }, + { + "epoch": 4.166932907348243, + "grad_norm": 0.12396319955587387, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5217 + }, + { + "epoch": 4.167731629392971, + "grad_norm": 0.11295704543590546, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5218 + }, + { + "epoch": 4.1685303514377, + "grad_norm": 0.1415906846523285, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5219 + }, + { + "epoch": 4.169329073482428, + "grad_norm": 0.1300252079963684, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5220 + }, + { + "epoch": 4.170127795527157, + "grad_norm": 0.09486760199069977, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5221 + }, + { + "epoch": 4.170926517571885, + "grad_norm": 0.25776198506355286, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5222 + }, + { + "epoch": 4.171725239616613, + "grad_norm": 0.07684944570064545, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5223 + }, + { + "epoch": 4.172523961661342, + "grad_norm": 0.06909538060426712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5224 + }, + { + "epoch": 4.17332268370607, + "grad_norm": 0.09686419367790222, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5225 + }, + { + "epoch": 4.174121405750799, + "grad_norm": 0.10760180652141571, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5226 + }, + { + "epoch": 4.174920127795527, + "grad_norm": 0.0963902473449707, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5227 + }, + { + "epoch": 4.175718849840256, + "grad_norm": 0.12986192107200623, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5228 + }, + { + "epoch": 4.176517571884984, + "grad_norm": 0.12532354891300201, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5229 + }, + { + "epoch": 4.177316293929713, + "grad_norm": 0.158639058470726, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5230 + }, + { + "epoch": 4.178115015974441, + "grad_norm": 0.10025905817747116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5231 + }, + { + "epoch": 4.178913738019169, + "grad_norm": 0.19150952994823456, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5232 + }, + { + "epoch": 4.1797124600638975, + "grad_norm": 0.10650201886892319, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5233 + }, + { + "epoch": 4.180511182108626, + "grad_norm": 0.08948210626840591, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5234 + }, + { + "epoch": 4.181309904153355, + "grad_norm": 0.144260972738266, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5235 + }, + { + "epoch": 4.182108626198083, + "grad_norm": 0.10631201416254044, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5236 + }, + { + "epoch": 4.182907348242812, + "grad_norm": 0.17884188890457153, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5237 + }, + { + "epoch": 4.18370607028754, + "grad_norm": 0.12393054366111755, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5238 + }, + { + "epoch": 4.184504792332269, + "grad_norm": 0.10113117098808289, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5239 + }, + { + "epoch": 4.185303514376997, + "grad_norm": 0.08745535463094711, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5240 + }, + { + "epoch": 4.186102236421725, + "grad_norm": 0.12319829314947128, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5241 + }, + { + "epoch": 4.186900958466453, + "grad_norm": 0.10202868282794952, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5242 + }, + { + "epoch": 4.187699680511182, + "grad_norm": 0.12799306213855743, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5243 + }, + { + "epoch": 4.18849840255591, + "grad_norm": 0.10247227549552917, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5244 + }, + { + "epoch": 4.189297124600639, + "grad_norm": 0.0876200944185257, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5245 + }, + { + "epoch": 4.1900958466453675, + "grad_norm": 0.08829693496227264, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5246 + }, + { + "epoch": 4.190894568690096, + "grad_norm": 0.09005091339349747, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5247 + }, + { + "epoch": 4.1916932907348246, + "grad_norm": 0.06715424358844757, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5248 + }, + { + "epoch": 4.192492012779553, + "grad_norm": 0.11082255840301514, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5249 + }, + { + "epoch": 4.193290734824281, + "grad_norm": 0.08197743445634842, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5250 + }, + { + "epoch": 4.194089456869009, + "grad_norm": 0.08641887456178665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5251 + }, + { + "epoch": 4.194888178913738, + "grad_norm": 0.29264676570892334, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5252 + }, + { + "epoch": 4.195686900958466, + "grad_norm": 0.10122201591730118, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5253 + }, + { + "epoch": 4.196485623003195, + "grad_norm": 0.13220930099487305, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5254 + }, + { + "epoch": 4.197284345047923, + "grad_norm": 0.05919777229428291, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5255 + }, + { + "epoch": 4.198083067092652, + "grad_norm": 0.15947407484054565, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5256 + }, + { + "epoch": 4.19888178913738, + "grad_norm": 0.08046088367700577, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5257 + }, + { + "epoch": 4.199680511182109, + "grad_norm": 0.08504491299390793, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5258 + }, + { + "epoch": 4.2004792332268375, + "grad_norm": 0.2523876428604126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5259 + }, + { + "epoch": 4.201277955271565, + "grad_norm": 0.32436496019363403, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5260 + }, + { + "epoch": 4.202076677316294, + "grad_norm": 0.3832956552505493, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5261 + }, + { + "epoch": 4.202875399361022, + "grad_norm": 0.15481804311275482, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5262 + }, + { + "epoch": 4.203674121405751, + "grad_norm": 0.5061212182044983, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5263 + }, + { + "epoch": 4.204472843450479, + "grad_norm": 0.2778873145580292, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5264 + }, + { + "epoch": 4.205271565495208, + "grad_norm": 0.10782434046268463, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5265 + }, + { + "epoch": 4.206070287539936, + "grad_norm": 0.2730430066585541, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5266 + }, + { + "epoch": 4.206869009584665, + "grad_norm": 0.14902958273887634, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5267 + }, + { + "epoch": 4.207667731629393, + "grad_norm": 0.2455812245607376, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5268 + }, + { + "epoch": 4.208466453674121, + "grad_norm": 0.36285653710365295, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5269 + }, + { + "epoch": 4.2092651757188495, + "grad_norm": 0.16104358434677124, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5270 + }, + { + "epoch": 4.210063897763578, + "grad_norm": 0.10330995172262192, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5271 + }, + { + "epoch": 4.210862619808307, + "grad_norm": 0.14438849687576294, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5272 + }, + { + "epoch": 4.211661341853035, + "grad_norm": 0.11719724535942078, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5273 + }, + { + "epoch": 4.212460063897764, + "grad_norm": 0.13503463566303253, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5274 + }, + { + "epoch": 4.213258785942492, + "grad_norm": 0.12717710435390472, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5275 + }, + { + "epoch": 4.214057507987221, + "grad_norm": 0.12293769419193268, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5276 + }, + { + "epoch": 4.214856230031949, + "grad_norm": 0.11828786134719849, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5277 + }, + { + "epoch": 4.215654952076678, + "grad_norm": 0.11118468642234802, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5278 + }, + { + "epoch": 4.216453674121405, + "grad_norm": 0.15688025951385498, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5279 + }, + { + "epoch": 4.217252396166134, + "grad_norm": 0.10603991895914078, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5280 + }, + { + "epoch": 4.218051118210862, + "grad_norm": 0.14034971594810486, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5281 + }, + { + "epoch": 4.218849840255591, + "grad_norm": 0.21270571649074554, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5282 + }, + { + "epoch": 4.2196485623003195, + "grad_norm": 0.17699144780635834, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5283 + }, + { + "epoch": 4.220447284345048, + "grad_norm": 0.07665220648050308, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5284 + }, + { + "epoch": 4.2212460063897765, + "grad_norm": 0.13917282223701477, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5285 + }, + { + "epoch": 4.222044728434505, + "grad_norm": 0.1253320872783661, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5286 + }, + { + "epoch": 4.222843450479234, + "grad_norm": 0.07693646103143692, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5287 + }, + { + "epoch": 4.223642172523961, + "grad_norm": 0.11877891421318054, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5288 + }, + { + "epoch": 4.22444089456869, + "grad_norm": 0.08900399506092072, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5289 + }, + { + "epoch": 4.225239616613418, + "grad_norm": 0.08575741946697235, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5290 + }, + { + "epoch": 4.226038338658147, + "grad_norm": 0.11078973859548569, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5291 + }, + { + "epoch": 4.226837060702875, + "grad_norm": 0.12371394783258438, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5292 + }, + { + "epoch": 4.227635782747604, + "grad_norm": 0.11741651594638824, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5293 + }, + { + "epoch": 4.228434504792332, + "grad_norm": 0.1316244751214981, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5294 + }, + { + "epoch": 4.229233226837061, + "grad_norm": 0.07751733064651489, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5295 + }, + { + "epoch": 4.2300319488817895, + "grad_norm": 0.13512739539146423, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5296 + }, + { + "epoch": 4.230830670926518, + "grad_norm": 0.14408327639102936, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5297 + }, + { + "epoch": 4.231629392971246, + "grad_norm": 0.05596759170293808, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5298 + }, + { + "epoch": 4.232428115015974, + "grad_norm": 0.20518198609352112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5299 + }, + { + "epoch": 4.233226837060703, + "grad_norm": 0.17000356316566467, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5300 + }, + { + "epoch": 4.234025559105431, + "grad_norm": 0.10213350504636765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5301 + }, + { + "epoch": 4.23482428115016, + "grad_norm": 0.1633368879556656, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 4.235623003194888, + "grad_norm": 0.17330236732959747, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5303 + }, + { + "epoch": 4.236421725239617, + "grad_norm": 0.20028679072856903, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5304 + }, + { + "epoch": 4.237220447284345, + "grad_norm": 0.23386533558368683, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5305 + }, + { + "epoch": 4.238019169329074, + "grad_norm": 0.051739469170570374, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5306 + }, + { + "epoch": 4.2388178913738015, + "grad_norm": 0.19732257723808289, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5307 + }, + { + "epoch": 4.23961661341853, + "grad_norm": 0.1318890005350113, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5308 + }, + { + "epoch": 4.2404153354632586, + "grad_norm": 0.17188113927841187, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5309 + }, + { + "epoch": 4.241214057507987, + "grad_norm": 0.23981456458568573, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5310 + }, + { + "epoch": 4.242012779552716, + "grad_norm": 0.15658913552761078, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5311 + }, + { + "epoch": 4.242811501597444, + "grad_norm": 0.13481132686138153, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5312 + }, + { + "epoch": 4.243610223642173, + "grad_norm": 0.16327355802059174, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5313 + }, + { + "epoch": 4.244408945686901, + "grad_norm": 0.0873674675822258, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5314 + }, + { + "epoch": 4.24520766773163, + "grad_norm": 0.16612505912780762, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5315 + }, + { + "epoch": 4.246006389776358, + "grad_norm": 0.15376444160938263, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5316 + }, + { + "epoch": 4.246805111821086, + "grad_norm": 0.07853512465953827, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5317 + }, + { + "epoch": 4.247603833865814, + "grad_norm": 0.11799992620944977, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5318 + }, + { + "epoch": 4.248402555910543, + "grad_norm": 0.09121575206518173, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 5319 + }, + { + "epoch": 4.2492012779552715, + "grad_norm": 0.09780153632164001, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5320 + }, + { + "epoch": 4.25, + "grad_norm": 0.11387690156698227, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5321 + }, + { + "epoch": 4.2507987220447285, + "grad_norm": 0.08085697889328003, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5322 + }, + { + "epoch": 4.251597444089457, + "grad_norm": 0.09986089169979095, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5323 + }, + { + "epoch": 4.252396166134186, + "grad_norm": 0.07728606462478638, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5324 + }, + { + "epoch": 4.253194888178914, + "grad_norm": 0.07464555650949478, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5325 + }, + { + "epoch": 4.253993610223642, + "grad_norm": 0.05129759758710861, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5326 + }, + { + "epoch": 4.25479233226837, + "grad_norm": 0.060275599360466, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5327 + }, + { + "epoch": 4.255591054313099, + "grad_norm": 0.07773016393184662, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5328 + }, + { + "epoch": 4.256389776357827, + "grad_norm": 0.1046462282538414, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5329 + }, + { + "epoch": 4.257188498402556, + "grad_norm": 0.1184321865439415, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5330 + }, + { + "epoch": 4.257987220447284, + "grad_norm": 0.1419631987810135, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5331 + }, + { + "epoch": 4.258785942492013, + "grad_norm": 0.10022144019603729, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5332 + }, + { + "epoch": 4.2595846645367414, + "grad_norm": 0.075701504945755, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5333 + }, + { + "epoch": 4.26038338658147, + "grad_norm": 0.18145573139190674, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5334 + }, + { + "epoch": 4.261182108626198, + "grad_norm": 0.06092703342437744, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5335 + }, + { + "epoch": 4.261980830670926, + "grad_norm": 0.13196219503879547, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5336 + }, + { + "epoch": 4.262779552715655, + "grad_norm": 0.17139793932437897, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5337 + }, + { + "epoch": 4.263578274760383, + "grad_norm": 0.12072623521089554, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5338 + }, + { + "epoch": 4.264376996805112, + "grad_norm": 0.11874449253082275, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5339 + }, + { + "epoch": 4.26517571884984, + "grad_norm": 0.10718921571969986, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5340 + }, + { + "epoch": 4.265974440894569, + "grad_norm": 0.07337968051433563, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5341 + }, + { + "epoch": 4.266773162939297, + "grad_norm": 0.11872536689043045, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5342 + }, + { + "epoch": 4.267571884984026, + "grad_norm": 0.11199923604726791, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5343 + }, + { + "epoch": 4.268370607028754, + "grad_norm": 0.05864759162068367, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5344 + }, + { + "epoch": 4.269169329073483, + "grad_norm": 0.14757969975471497, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5345 + }, + { + "epoch": 4.2699680511182105, + "grad_norm": 0.12190169841051102, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5346 + }, + { + "epoch": 4.270766773162939, + "grad_norm": 0.0532461479306221, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5347 + }, + { + "epoch": 4.271565495207668, + "grad_norm": 0.10723208636045456, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5348 + }, + { + "epoch": 4.272364217252396, + "grad_norm": 0.07115229964256287, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5349 + }, + { + "epoch": 4.273162939297125, + "grad_norm": 0.07450878620147705, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 5350 + }, + { + "epoch": 4.273961661341853, + "grad_norm": 0.11793115735054016, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5351 + }, + { + "epoch": 4.274760383386582, + "grad_norm": 0.10440219938755035, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5352 + }, + { + "epoch": 4.27555910543131, + "grad_norm": 0.27991926670074463, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5353 + }, + { + "epoch": 4.276357827476039, + "grad_norm": 0.11090446263551712, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5354 + }, + { + "epoch": 4.277156549520766, + "grad_norm": 0.10509627312421799, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5355 + }, + { + "epoch": 4.277955271565495, + "grad_norm": 0.06217970326542854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5356 + }, + { + "epoch": 4.2787539936102235, + "grad_norm": 0.34369224309921265, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5357 + }, + { + "epoch": 4.279552715654952, + "grad_norm": 0.1246214285492897, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5358 + }, + { + "epoch": 4.2803514376996805, + "grad_norm": 0.06331677734851837, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5359 + }, + { + "epoch": 4.281150159744409, + "grad_norm": 0.08274740725755692, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5360 + }, + { + "epoch": 4.281948881789138, + "grad_norm": 0.06133527308702469, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5361 + }, + { + "epoch": 4.282747603833866, + "grad_norm": 0.09867174178361893, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5362 + }, + { + "epoch": 4.283546325878595, + "grad_norm": 0.09370579570531845, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5363 + }, + { + "epoch": 4.284345047923322, + "grad_norm": 0.2549540400505066, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5364 + }, + { + "epoch": 4.285143769968051, + "grad_norm": 0.1900271773338318, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5365 + }, + { + "epoch": 4.285942492012779, + "grad_norm": 0.21450525522232056, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5366 + }, + { + "epoch": 4.286741214057508, + "grad_norm": 0.1381012350320816, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5367 + }, + { + "epoch": 4.287539936102236, + "grad_norm": 0.0813983827829361, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5368 + }, + { + "epoch": 4.288338658146965, + "grad_norm": 0.16513130068778992, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5369 + }, + { + "epoch": 4.289137380191693, + "grad_norm": 0.10825667530298233, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5370 + }, + { + "epoch": 4.289936102236422, + "grad_norm": 0.07226242125034332, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5371 + }, + { + "epoch": 4.2907348242811505, + "grad_norm": 0.1278400719165802, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5372 + }, + { + "epoch": 4.291533546325878, + "grad_norm": 0.11092592030763626, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5373 + }, + { + "epoch": 4.292332268370607, + "grad_norm": 0.08732229471206665, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5374 + }, + { + "epoch": 4.293130990415335, + "grad_norm": 0.2182341367006302, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5375 + }, + { + "epoch": 4.293929712460064, + "grad_norm": 0.10107403993606567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5376 + }, + { + "epoch": 4.294728434504792, + "grad_norm": 0.13586364686489105, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5377 + }, + { + "epoch": 4.295527156549521, + "grad_norm": 0.3685734272003174, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5378 + }, + { + "epoch": 4.296325878594249, + "grad_norm": 0.13060712814331055, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5379 + }, + { + "epoch": 4.297124600638978, + "grad_norm": 0.05988436937332153, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5380 + }, + { + "epoch": 4.297923322683706, + "grad_norm": 0.14392045140266418, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 5381 + }, + { + "epoch": 4.298722044728435, + "grad_norm": 0.25003254413604736, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5382 + }, + { + "epoch": 4.2995207667731625, + "grad_norm": 0.055451687425374985, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5383 + }, + { + "epoch": 4.300319488817891, + "grad_norm": 0.11186914891004562, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5384 + }, + { + "epoch": 4.30111821086262, + "grad_norm": 0.11314704269170761, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5385 + }, + { + "epoch": 4.301916932907348, + "grad_norm": 0.43445560336112976, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5386 + }, + { + "epoch": 4.302715654952077, + "grad_norm": 0.09362242370843887, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5387 + }, + { + "epoch": 4.303514376996805, + "grad_norm": 0.04405852034687996, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5388 + }, + { + "epoch": 4.304313099041534, + "grad_norm": 0.12615318596363068, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5389 + }, + { + "epoch": 4.305111821086262, + "grad_norm": 0.1067153736948967, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5390 + }, + { + "epoch": 4.305910543130991, + "grad_norm": 0.05732683837413788, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5391 + }, + { + "epoch": 4.306709265175719, + "grad_norm": 0.2452571988105774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5392 + }, + { + "epoch": 4.307507987220447, + "grad_norm": 0.11733133345842361, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5393 + }, + { + "epoch": 4.3083067092651754, + "grad_norm": 0.06771894544363022, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5394 + }, + { + "epoch": 4.309105431309904, + "grad_norm": 0.12928563356399536, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5395 + }, + { + "epoch": 4.3099041533546325, + "grad_norm": 0.1777956187725067, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5396 + }, + { + "epoch": 4.310702875399361, + "grad_norm": 0.1281544715166092, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5397 + }, + { + "epoch": 4.31150159744409, + "grad_norm": 0.07120000571012497, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5398 + }, + { + "epoch": 4.312300319488818, + "grad_norm": 0.1270848512649536, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5399 + }, + { + "epoch": 4.313099041533547, + "grad_norm": 0.17685648798942566, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5400 + }, + { + "epoch": 4.313897763578275, + "grad_norm": 0.05070900544524193, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5401 + }, + { + "epoch": 4.314696485623003, + "grad_norm": 0.10543418675661087, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5402 + }, + { + "epoch": 4.315495207667731, + "grad_norm": 0.12336398661136627, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5403 + }, + { + "epoch": 4.31629392971246, + "grad_norm": 0.1583624631166458, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5404 + }, + { + "epoch": 4.317092651757188, + "grad_norm": 0.08186022192239761, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5405 + }, + { + "epoch": 4.317891373801917, + "grad_norm": 0.07562705129384995, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5406 + }, + { + "epoch": 4.318690095846645, + "grad_norm": 0.05275554209947586, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5407 + }, + { + "epoch": 4.319488817891374, + "grad_norm": 0.06432928144931793, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5408 + }, + { + "epoch": 4.3202875399361025, + "grad_norm": 0.08220377564430237, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5409 + }, + { + "epoch": 4.321086261980831, + "grad_norm": 0.07882758229970932, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5410 + }, + { + "epoch": 4.321884984025559, + "grad_norm": 0.138245090842247, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5411 + }, + { + "epoch": 4.322683706070287, + "grad_norm": 0.1127534806728363, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5412 + }, + { + "epoch": 4.323482428115016, + "grad_norm": 0.1985669732093811, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5413 + }, + { + "epoch": 4.324281150159744, + "grad_norm": 0.08023711293935776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5414 + }, + { + "epoch": 4.325079872204473, + "grad_norm": 0.13853015005588531, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5415 + }, + { + "epoch": 4.325878594249201, + "grad_norm": 0.18319782614707947, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5416 + }, + { + "epoch": 4.32667731629393, + "grad_norm": 0.073015958070755, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5417 + }, + { + "epoch": 4.327476038338658, + "grad_norm": 0.10771846771240234, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5418 + }, + { + "epoch": 4.328274760383387, + "grad_norm": 0.09512028843164444, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5419 + }, + { + "epoch": 4.329073482428115, + "grad_norm": 0.0822201818227768, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5420 + }, + { + "epoch": 4.329872204472843, + "grad_norm": 0.11839213222265244, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5421 + }, + { + "epoch": 4.330670926517572, + "grad_norm": 0.10274796187877655, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5422 + }, + { + "epoch": 4.3314696485623, + "grad_norm": 0.05896717682480812, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5423 + }, + { + "epoch": 4.332268370607029, + "grad_norm": 0.1268780380487442, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5424 + }, + { + "epoch": 4.333067092651757, + "grad_norm": 0.09173188358545303, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5425 + }, + { + "epoch": 4.333865814696486, + "grad_norm": 0.05155360326170921, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5426 + }, + { + "epoch": 4.334664536741214, + "grad_norm": 0.08836793899536133, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 4.335463258785943, + "grad_norm": 0.08620470017194748, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5428 + }, + { + "epoch": 4.336261980830671, + "grad_norm": 0.06972123682498932, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5429 + }, + { + "epoch": 4.3370607028754, + "grad_norm": 0.12461638450622559, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5430 + }, + { + "epoch": 4.337859424920127, + "grad_norm": 0.08546463400125504, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5431 + }, + { + "epoch": 4.338658146964856, + "grad_norm": 0.08495177328586578, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5432 + }, + { + "epoch": 4.3394568690095845, + "grad_norm": 0.13017377257347107, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5433 + }, + { + "epoch": 4.340255591054313, + "grad_norm": 0.13619504868984222, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5434 + }, + { + "epoch": 4.3410543130990416, + "grad_norm": 0.5835675597190857, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5435 + }, + { + "epoch": 4.34185303514377, + "grad_norm": 0.09355206042528152, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5436 + }, + { + "epoch": 4.342651757188499, + "grad_norm": 0.08626751601696014, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5437 + }, + { + "epoch": 4.343450479233227, + "grad_norm": 0.05652647092938423, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5438 + }, + { + "epoch": 4.344249201277956, + "grad_norm": 0.05232316255569458, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5439 + }, + { + "epoch": 4.345047923322683, + "grad_norm": 0.08115233480930328, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5440 + }, + { + "epoch": 4.345846645367412, + "grad_norm": 0.08757120370864868, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5441 + }, + { + "epoch": 4.34664536741214, + "grad_norm": 0.046224139630794525, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5442 + }, + { + "epoch": 4.347444089456869, + "grad_norm": 0.07967934757471085, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5443 + }, + { + "epoch": 4.348242811501597, + "grad_norm": 0.044298652559518814, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5444 + }, + { + "epoch": 4.349041533546326, + "grad_norm": 0.09021158516407013, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5445 + }, + { + "epoch": 4.3498402555910545, + "grad_norm": 0.12857890129089355, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5446 + }, + { + "epoch": 4.350638977635783, + "grad_norm": 0.05655589699745178, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5447 + }, + { + "epoch": 4.3514376996805115, + "grad_norm": 0.09304624050855637, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5448 + }, + { + "epoch": 4.352236421725239, + "grad_norm": 0.19815632700920105, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5449 + }, + { + "epoch": 4.353035143769968, + "grad_norm": 0.0526299886405468, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5450 + }, + { + "epoch": 4.353833865814696, + "grad_norm": 0.06432242691516876, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5451 + }, + { + "epoch": 4.354632587859425, + "grad_norm": 0.07848794758319855, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5452 + }, + { + "epoch": 4.355431309904153, + "grad_norm": 0.08260536193847656, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5453 + }, + { + "epoch": 4.356230031948882, + "grad_norm": 0.052810169756412506, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5454 + }, + { + "epoch": 4.35702875399361, + "grad_norm": 0.06942226737737656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5455 + }, + { + "epoch": 4.357827476038339, + "grad_norm": 0.13892871141433716, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5456 + }, + { + "epoch": 4.358626198083067, + "grad_norm": 0.15982909500598907, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5457 + }, + { + "epoch": 4.359424920127796, + "grad_norm": 0.08206653594970703, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5458 + }, + { + "epoch": 4.360223642172524, + "grad_norm": 0.08957790583372116, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5459 + }, + { + "epoch": 4.361022364217252, + "grad_norm": 0.03882770985364914, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5460 + }, + { + "epoch": 4.361821086261981, + "grad_norm": 0.0928555279970169, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5461 + }, + { + "epoch": 4.362619808306709, + "grad_norm": 0.057321447879076004, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5462 + }, + { + "epoch": 4.363418530351438, + "grad_norm": 0.0737103596329689, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5463 + }, + { + "epoch": 4.364217252396166, + "grad_norm": 0.06696293503046036, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5464 + }, + { + "epoch": 4.365015974440895, + "grad_norm": 0.04572489857673645, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5465 + }, + { + "epoch": 4.365814696485623, + "grad_norm": 0.094516322016716, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5466 + }, + { + "epoch": 4.366613418530352, + "grad_norm": 0.045576825737953186, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5467 + }, + { + "epoch": 4.36741214057508, + "grad_norm": 0.06839725375175476, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5468 + }, + { + "epoch": 4.368210862619808, + "grad_norm": 0.14465193450450897, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5469 + }, + { + "epoch": 4.3690095846645365, + "grad_norm": 0.07930073887109756, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5470 + }, + { + "epoch": 4.369808306709265, + "grad_norm": 0.06120619550347328, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5471 + }, + { + "epoch": 4.3706070287539935, + "grad_norm": 0.066256083548069, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5472 + }, + { + "epoch": 4.371405750798722, + "grad_norm": 0.11696353554725647, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5473 + }, + { + "epoch": 4.372204472843451, + "grad_norm": 0.11530395597219467, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5474 + }, + { + "epoch": 4.373003194888179, + "grad_norm": 0.05663579702377319, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5475 + }, + { + "epoch": 4.373801916932908, + "grad_norm": 0.1241946592926979, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5476 + }, + { + "epoch": 4.374600638977636, + "grad_norm": 0.1725323498249054, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5477 + }, + { + "epoch": 4.375399361022364, + "grad_norm": 0.09785371273756027, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5478 + }, + { + "epoch": 4.376198083067092, + "grad_norm": 0.0813792496919632, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5479 + }, + { + "epoch": 4.376996805111821, + "grad_norm": 0.17471592128276825, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5480 + }, + { + "epoch": 4.377795527156549, + "grad_norm": 0.1923220455646515, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5481 + }, + { + "epoch": 4.378594249201278, + "grad_norm": 0.09857932478189468, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5482 + }, + { + "epoch": 4.3793929712460065, + "grad_norm": 0.10073419660329819, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5483 + }, + { + "epoch": 4.380191693290735, + "grad_norm": 0.35731273889541626, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5484 + }, + { + "epoch": 4.3809904153354635, + "grad_norm": 0.12060656398534775, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5485 + }, + { + "epoch": 4.381789137380192, + "grad_norm": 0.10264381766319275, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5486 + }, + { + "epoch": 4.38258785942492, + "grad_norm": 0.0868317037820816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5487 + }, + { + "epoch": 4.383386581469648, + "grad_norm": 0.07722344994544983, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5488 + }, + { + "epoch": 4.384185303514377, + "grad_norm": 0.3690173327922821, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5489 + }, + { + "epoch": 4.384984025559105, + "grad_norm": 0.18400169909000397, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5490 + }, + { + "epoch": 4.385782747603834, + "grad_norm": 0.14671844244003296, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5491 + }, + { + "epoch": 4.386581469648562, + "grad_norm": 0.05277179554104805, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5492 + }, + { + "epoch": 4.387380191693291, + "grad_norm": 0.13593660295009613, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5493 + }, + { + "epoch": 4.388178913738019, + "grad_norm": 0.1318334937095642, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5494 + }, + { + "epoch": 4.388977635782748, + "grad_norm": 0.07189908623695374, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5495 + }, + { + "epoch": 4.389776357827476, + "grad_norm": 0.07969736307859421, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5496 + }, + { + "epoch": 4.390575079872204, + "grad_norm": 0.07449150085449219, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5497 + }, + { + "epoch": 4.391373801916933, + "grad_norm": 0.533295214176178, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5498 + }, + { + "epoch": 4.392172523961661, + "grad_norm": 0.10412111133337021, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5499 + }, + { + "epoch": 4.39297124600639, + "grad_norm": 0.08482066541910172, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5500 + }, + { + "epoch": 4.393769968051118, + "grad_norm": 0.08023949712514877, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5501 + }, + { + "epoch": 4.394568690095847, + "grad_norm": 0.16967490315437317, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5502 + }, + { + "epoch": 4.395367412140575, + "grad_norm": 0.1979716271162033, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5503 + }, + { + "epoch": 4.396166134185304, + "grad_norm": 0.09058263152837753, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5504 + }, + { + "epoch": 4.396964856230032, + "grad_norm": 0.13149574398994446, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5505 + }, + { + "epoch": 4.397763578274761, + "grad_norm": 0.08240146189928055, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5506 + }, + { + "epoch": 4.3985623003194885, + "grad_norm": 0.13789936900138855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5507 + }, + { + "epoch": 4.399361022364217, + "grad_norm": 0.18576087057590485, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5508 + }, + { + "epoch": 4.4001597444089455, + "grad_norm": 0.13780297338962555, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5509 + }, + { + "epoch": 4.400958466453674, + "grad_norm": 0.14724896848201752, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5510 + }, + { + "epoch": 4.401757188498403, + "grad_norm": 0.20418551564216614, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5511 + }, + { + "epoch": 4.402555910543131, + "grad_norm": 0.1841040551662445, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5512 + }, + { + "epoch": 4.40335463258786, + "grad_norm": 0.6994684338569641, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5513 + }, + { + "epoch": 4.404153354632588, + "grad_norm": 0.18882393836975098, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5514 + }, + { + "epoch": 4.404952076677317, + "grad_norm": 0.07170864939689636, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5515 + }, + { + "epoch": 4.405750798722044, + "grad_norm": 0.04765893518924713, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5516 + }, + { + "epoch": 4.406549520766773, + "grad_norm": 0.07294443249702454, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5517 + }, + { + "epoch": 4.407348242811501, + "grad_norm": 0.18566831946372986, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 5518 + }, + { + "epoch": 4.40814696485623, + "grad_norm": 0.10881441831588745, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5519 + }, + { + "epoch": 4.4089456869009584, + "grad_norm": 0.380438894033432, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5520 + }, + { + "epoch": 4.409744408945687, + "grad_norm": 0.19281962513923645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5521 + }, + { + "epoch": 4.4105431309904155, + "grad_norm": 0.05730361491441727, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5522 + }, + { + "epoch": 4.411341853035144, + "grad_norm": 0.09276643395423889, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5523 + }, + { + "epoch": 4.412140575079873, + "grad_norm": 0.070807084441185, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5524 + }, + { + "epoch": 4.4129392971246, + "grad_norm": 0.08902080357074738, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5525 + }, + { + "epoch": 4.413738019169329, + "grad_norm": 0.14861932396888733, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5526 + }, + { + "epoch": 4.414536741214057, + "grad_norm": 0.2678995728492737, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5527 + }, + { + "epoch": 4.415335463258786, + "grad_norm": 0.12902382016181946, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5528 + }, + { + "epoch": 4.416134185303514, + "grad_norm": 0.14999063313007355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5529 + }, + { + "epoch": 4.416932907348243, + "grad_norm": 0.13950730860233307, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5530 + }, + { + "epoch": 4.417731629392971, + "grad_norm": 0.12215374410152435, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5531 + }, + { + "epoch": 4.4185303514377, + "grad_norm": 0.12941284477710724, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5532 + }, + { + "epoch": 4.419329073482428, + "grad_norm": 0.22524291276931763, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5533 + }, + { + "epoch": 4.420127795527157, + "grad_norm": 0.0830528736114502, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5534 + }, + { + "epoch": 4.420926517571885, + "grad_norm": 0.1562981903553009, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5535 + }, + { + "epoch": 4.421725239616613, + "grad_norm": 0.19052654504776, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5536 + }, + { + "epoch": 4.422523961661342, + "grad_norm": 0.12264347821474075, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5537 + }, + { + "epoch": 4.42332268370607, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5538 + }, + { + "epoch": 4.424121405750799, + "grad_norm": 0.1412813812494278, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5539 + }, + { + "epoch": 4.424920127795527, + "grad_norm": 0.17808450758457184, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5540 + }, + { + "epoch": 4.425718849840256, + "grad_norm": 0.43806061148643494, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5541 + }, + { + "epoch": 4.426517571884984, + "grad_norm": 0.17728228867053986, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5542 + }, + { + "epoch": 4.427316293929713, + "grad_norm": 0.12434227764606476, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5543 + }, + { + "epoch": 4.428115015974441, + "grad_norm": 0.10051420331001282, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5544 + }, + { + "epoch": 4.428913738019169, + "grad_norm": 0.0943203940987587, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5545 + }, + { + "epoch": 4.4297124600638975, + "grad_norm": 0.08082996308803558, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5546 + }, + { + "epoch": 4.430511182108626, + "grad_norm": 0.13405202329158783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5547 + }, + { + "epoch": 4.431309904153355, + "grad_norm": 0.10448389500379562, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5548 + }, + { + "epoch": 4.432108626198083, + "grad_norm": 0.32405009865760803, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5549 + }, + { + "epoch": 4.432907348242812, + "grad_norm": 0.09690065681934357, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5550 + }, + { + "epoch": 4.43370607028754, + "grad_norm": 0.35410076379776, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5551 + }, + { + "epoch": 4.434504792332269, + "grad_norm": 0.17826306819915771, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5552 + }, + { + "epoch": 4.435303514376997, + "grad_norm": 0.2252579778432846, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5553 + }, + { + "epoch": 4.436102236421725, + "grad_norm": 0.09508918970823288, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5554 + }, + { + "epoch": 4.436900958466453, + "grad_norm": 0.16872358322143555, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5555 + }, + { + "epoch": 4.437699680511182, + "grad_norm": 0.24836355447769165, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5556 + }, + { + "epoch": 4.43849840255591, + "grad_norm": 0.20887835323810577, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5557 + }, + { + "epoch": 4.439297124600639, + "grad_norm": 0.10922685265541077, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5558 + }, + { + "epoch": 4.4400958466453675, + "grad_norm": 0.44561028480529785, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5559 + }, + { + "epoch": 4.440894568690096, + "grad_norm": 0.18160179257392883, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5560 + }, + { + "epoch": 4.4416932907348246, + "grad_norm": 0.06924877315759659, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5561 + }, + { + "epoch": 4.442492012779553, + "grad_norm": 0.15605933964252472, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5562 + }, + { + "epoch": 4.443290734824281, + "grad_norm": 0.10880772024393082, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5563 + }, + { + "epoch": 4.444089456869009, + "grad_norm": 0.1252668797969818, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5564 + }, + { + "epoch": 4.444888178913738, + "grad_norm": 0.20452634990215302, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5565 + }, + { + "epoch": 4.445686900958466, + "grad_norm": 0.20973001420497894, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5566 + }, + { + "epoch": 4.446485623003195, + "grad_norm": 0.07631060481071472, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5567 + }, + { + "epoch": 4.447284345047923, + "grad_norm": 0.14793622493743896, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5568 + }, + { + "epoch": 4.448083067092652, + "grad_norm": 0.30125850439071655, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5569 + }, + { + "epoch": 4.44888178913738, + "grad_norm": 0.1291274130344391, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5570 + }, + { + "epoch": 4.449680511182109, + "grad_norm": 0.08679793030023575, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5571 + }, + { + "epoch": 4.4504792332268375, + "grad_norm": 0.11555953323841095, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5572 + }, + { + "epoch": 4.451277955271565, + "grad_norm": 0.10711846500635147, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5573 + }, + { + "epoch": 4.452076677316294, + "grad_norm": 0.0604897104203701, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5574 + }, + { + "epoch": 4.452875399361022, + "grad_norm": 0.08729933202266693, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5575 + }, + { + "epoch": 4.453674121405751, + "grad_norm": 0.09586715698242188, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5576 + }, + { + "epoch": 4.454472843450479, + "grad_norm": 0.11635993421077728, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5577 + }, + { + "epoch": 4.455271565495208, + "grad_norm": 0.12405801564455032, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5578 + }, + { + "epoch": 4.456070287539936, + "grad_norm": 0.1284986287355423, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5579 + }, + { + "epoch": 4.456869009584665, + "grad_norm": 0.09059973061084747, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5580 + }, + { + "epoch": 4.457667731629393, + "grad_norm": 0.08497101068496704, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5581 + }, + { + "epoch": 4.458466453674122, + "grad_norm": 0.10315481573343277, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5582 + }, + { + "epoch": 4.4592651757188495, + "grad_norm": 0.09923984855413437, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5583 + }, + { + "epoch": 4.460063897763578, + "grad_norm": 0.09179794788360596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5584 + }, + { + "epoch": 4.460862619808307, + "grad_norm": 0.0783005952835083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5585 + }, + { + "epoch": 4.461661341853035, + "grad_norm": 0.4005993604660034, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5586 + }, + { + "epoch": 4.462460063897764, + "grad_norm": 0.09382215887308121, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5587 + }, + { + "epoch": 4.463258785942492, + "grad_norm": 0.10208452492952347, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5588 + }, + { + "epoch": 4.464057507987221, + "grad_norm": 0.08237040042877197, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5589 + }, + { + "epoch": 4.464856230031949, + "grad_norm": 0.07287969440221786, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 5590 + }, + { + "epoch": 4.465654952076678, + "grad_norm": 0.07156763970851898, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5591 + }, + { + "epoch": 4.466453674121405, + "grad_norm": 0.11347219347953796, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5592 + }, + { + "epoch": 4.467252396166134, + "grad_norm": 0.13722039759159088, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5593 + }, + { + "epoch": 4.468051118210862, + "grad_norm": 0.20186153054237366, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5594 + }, + { + "epoch": 4.468849840255591, + "grad_norm": 0.1548159420490265, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5595 + }, + { + "epoch": 4.4696485623003195, + "grad_norm": 0.08960088342428207, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5596 + }, + { + "epoch": 4.470447284345048, + "grad_norm": 0.23552097380161285, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5597 + }, + { + "epoch": 4.4712460063897765, + "grad_norm": 0.34478914737701416, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5598 + }, + { + "epoch": 4.472044728434505, + "grad_norm": 0.219953253865242, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5599 + }, + { + "epoch": 4.472843450479234, + "grad_norm": 0.13104191422462463, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5600 + }, + { + "epoch": 4.473642172523961, + "grad_norm": 0.2867056131362915, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5601 + }, + { + "epoch": 4.47444089456869, + "grad_norm": 0.15794725716114044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5602 + }, + { + "epoch": 4.475239616613418, + "grad_norm": 0.10884165018796921, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5603 + }, + { + "epoch": 4.476038338658147, + "grad_norm": 1.0521267652511597, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5604 + }, + { + "epoch": 4.476837060702875, + "grad_norm": 0.07823536545038223, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5605 + }, + { + "epoch": 4.477635782747604, + "grad_norm": 0.1536101996898651, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5606 + }, + { + "epoch": 4.478434504792332, + "grad_norm": 0.1379251778125763, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5607 + }, + { + "epoch": 4.479233226837061, + "grad_norm": 0.06181122735142708, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5608 + }, + { + "epoch": 4.4800319488817895, + "grad_norm": 0.1701904535293579, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5609 + }, + { + "epoch": 4.480830670926517, + "grad_norm": 0.1322227120399475, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5610 + }, + { + "epoch": 4.481629392971246, + "grad_norm": 0.09158491343259811, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5611 + }, + { + "epoch": 4.482428115015974, + "grad_norm": 0.09851136803627014, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5612 + }, + { + "epoch": 4.483226837060703, + "grad_norm": 0.09350419789552689, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5613 + }, + { + "epoch": 4.484025559105431, + "grad_norm": 0.40614885091781616, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5614 + }, + { + "epoch": 4.48482428115016, + "grad_norm": 0.1653166264295578, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5615 + }, + { + "epoch": 4.485623003194888, + "grad_norm": 0.13429352641105652, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5616 + }, + { + "epoch": 4.486421725239617, + "grad_norm": 0.09340473264455795, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5617 + }, + { + "epoch": 4.487220447284345, + "grad_norm": 0.1621188223361969, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5618 + }, + { + "epoch": 4.488019169329074, + "grad_norm": 0.18538816273212433, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5619 + }, + { + "epoch": 4.488817891373802, + "grad_norm": 0.26981350779533386, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5620 + }, + { + "epoch": 4.48961661341853, + "grad_norm": 0.28865110874176025, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5621 + }, + { + "epoch": 4.4904153354632586, + "grad_norm": 0.23013874888420105, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5622 + }, + { + "epoch": 4.491214057507987, + "grad_norm": 0.08305853605270386, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5623 + }, + { + "epoch": 4.492012779552716, + "grad_norm": 0.1810445487499237, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5624 + }, + { + "epoch": 4.492811501597444, + "grad_norm": 0.23000332713127136, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5625 + }, + { + "epoch": 4.493610223642173, + "grad_norm": 0.06753652542829514, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5626 + }, + { + "epoch": 4.494408945686901, + "grad_norm": 0.19956068694591522, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5627 + }, + { + "epoch": 4.49520766773163, + "grad_norm": 0.24572248756885529, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5628 + }, + { + "epoch": 4.496006389776358, + "grad_norm": 0.06617605686187744, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5629 + }, + { + "epoch": 4.496805111821086, + "grad_norm": 0.18551495671272278, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.497603833865814, + "grad_norm": 0.16827648878097534, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5631 + }, + { + "epoch": 4.498402555910543, + "grad_norm": 0.13273993134498596, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5632 + }, + { + "epoch": 4.4992012779552715, + "grad_norm": 0.24461479485034943, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5633 + }, + { + "epoch": 4.5, + "grad_norm": 0.2016836553812027, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5634 + }, + { + "epoch": 4.5007987220447285, + "grad_norm": 0.07513006776571274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5635 + }, + { + "epoch": 4.501597444089457, + "grad_norm": 0.1701919138431549, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5636 + }, + { + "epoch": 4.502396166134186, + "grad_norm": 0.12785466015338898, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5637 + }, + { + "epoch": 4.503194888178914, + "grad_norm": 0.1135641485452652, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5638 + }, + { + "epoch": 4.503993610223642, + "grad_norm": 0.5004979372024536, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5639 + }, + { + "epoch": 4.50479233226837, + "grad_norm": 0.28730812668800354, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5640 + }, + { + "epoch": 4.505591054313099, + "grad_norm": 0.3666481673717499, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5641 + }, + { + "epoch": 4.506389776357827, + "grad_norm": 0.257710337638855, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5642 + }, + { + "epoch": 4.507188498402556, + "grad_norm": 0.20071941614151, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5643 + }, + { + "epoch": 4.507987220447284, + "grad_norm": 0.3445729613304138, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5644 + }, + { + "epoch": 4.508785942492013, + "grad_norm": 0.20297282934188843, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5645 + }, + { + "epoch": 4.5095846645367414, + "grad_norm": 0.1889636069536209, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5646 + }, + { + "epoch": 4.51038338658147, + "grad_norm": 0.2153794765472412, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5647 + }, + { + "epoch": 4.511182108626198, + "grad_norm": 0.15353621542453766, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5648 + }, + { + "epoch": 4.511980830670926, + "grad_norm": 0.1575399786233902, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5649 + }, + { + "epoch": 4.512779552715655, + "grad_norm": 0.5555608868598938, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5650 + }, + { + "epoch": 4.513578274760383, + "grad_norm": 0.26887524127960205, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5651 + }, + { + "epoch": 4.514376996805112, + "grad_norm": 0.11516866087913513, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5652 + }, + { + "epoch": 4.51517571884984, + "grad_norm": 0.19820965826511383, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5653 + }, + { + "epoch": 4.515974440894569, + "grad_norm": 0.2122081071138382, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5654 + }, + { + "epoch": 4.516773162939297, + "grad_norm": 0.10736703872680664, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5655 + }, + { + "epoch": 4.517571884984026, + "grad_norm": 0.09852312505245209, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5656 + }, + { + "epoch": 4.518370607028754, + "grad_norm": 0.07539162784814835, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5657 + }, + { + "epoch": 4.519169329073483, + "grad_norm": 0.07467353343963623, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5658 + }, + { + "epoch": 4.5199680511182105, + "grad_norm": 0.09987884759902954, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5659 + }, + { + "epoch": 4.520766773162939, + "grad_norm": 0.08720221370458603, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5660 + }, + { + "epoch": 4.521565495207668, + "grad_norm": 0.07798969000577927, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5661 + }, + { + "epoch": 4.522364217252396, + "grad_norm": 0.12410122901201248, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5662 + }, + { + "epoch": 4.523162939297125, + "grad_norm": 0.07746852934360504, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5663 + }, + { + "epoch": 4.523961661341853, + "grad_norm": 0.09171058982610703, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5664 + }, + { + "epoch": 4.524760383386582, + "grad_norm": 0.8176944255828857, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5665 + }, + { + "epoch": 4.52555910543131, + "grad_norm": 0.4282614290714264, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5666 + }, + { + "epoch": 4.526357827476039, + "grad_norm": 0.35193827748298645, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5667 + }, + { + "epoch": 4.527156549520766, + "grad_norm": 0.15641339123249054, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5668 + }, + { + "epoch": 4.527955271565495, + "grad_norm": 0.31442952156066895, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5669 + }, + { + "epoch": 4.5287539936102235, + "grad_norm": 0.3205500841140747, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5670 + }, + { + "epoch": 4.529552715654952, + "grad_norm": 0.2866390645503998, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5671 + }, + { + "epoch": 4.5303514376996805, + "grad_norm": 0.21028868854045868, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5672 + }, + { + "epoch": 4.531150159744409, + "grad_norm": 0.32687097787857056, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5673 + }, + { + "epoch": 4.531948881789138, + "grad_norm": 0.25662627816200256, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5674 + }, + { + "epoch": 4.532747603833866, + "grad_norm": 0.10192561894655228, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5675 + }, + { + "epoch": 4.533546325878595, + "grad_norm": 0.8102573752403259, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5676 + }, + { + "epoch": 4.534345047923322, + "grad_norm": 0.19127781689167023, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5677 + }, + { + "epoch": 4.535143769968051, + "grad_norm": 0.22435548901557922, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5678 + }, + { + "epoch": 4.535942492012779, + "grad_norm": 0.3271692395210266, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5679 + }, + { + "epoch": 4.536741214057508, + "grad_norm": 0.17226184904575348, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5680 + }, + { + "epoch": 4.537539936102236, + "grad_norm": 0.16628077626228333, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5681 + }, + { + "epoch": 4.538338658146965, + "grad_norm": 0.6196639537811279, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5682 + }, + { + "epoch": 4.539137380191693, + "grad_norm": 0.21590936183929443, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5683 + }, + { + "epoch": 4.539936102236422, + "grad_norm": 0.16313950717449188, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5684 + }, + { + "epoch": 4.5407348242811505, + "grad_norm": 0.12859022617340088, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5685 + }, + { + "epoch": 4.541533546325878, + "grad_norm": 0.1189458817243576, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5686 + }, + { + "epoch": 4.542332268370607, + "grad_norm": 6.769774913787842, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5687 + }, + { + "epoch": 4.543130990415335, + "grad_norm": 0.20253166556358337, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5688 + }, + { + "epoch": 4.543929712460064, + "grad_norm": 0.11631135642528534, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5689 + }, + { + "epoch": 4.544728434504792, + "grad_norm": 0.1848360300064087, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5690 + }, + { + "epoch": 4.545527156549521, + "grad_norm": 0.17804184556007385, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 5691 + }, + { + "epoch": 4.546325878594249, + "grad_norm": 0.2214183509349823, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5692 + }, + { + "epoch": 4.547124600638978, + "grad_norm": 16.448396682739258, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5693 + }, + { + "epoch": 4.547923322683706, + "grad_norm": 0.4933917224407196, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 5694 + }, + { + "epoch": 4.548722044728435, + "grad_norm": 0.41254448890686035, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 5695 + }, + { + "epoch": 4.549520766773163, + "grad_norm": 0.28898510336875916, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 5696 + }, + { + "epoch": 4.550319488817891, + "grad_norm": 0.2938457727432251, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5697 + }, + { + "epoch": 4.55111821086262, + "grad_norm": 0.2264672964811325, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5698 + }, + { + "epoch": 4.551916932907348, + "grad_norm": 0.12931588292121887, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5699 + }, + { + "epoch": 4.552715654952077, + "grad_norm": 0.22106601297855377, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5700 + }, + { + "epoch": 4.553514376996805, + "grad_norm": 0.31875962018966675, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5701 + }, + { + "epoch": 4.554313099041534, + "grad_norm": 0.3129211962223053, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5702 + }, + { + "epoch": 4.555111821086262, + "grad_norm": 0.1613578200340271, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5703 + }, + { + "epoch": 4.555910543130991, + "grad_norm": 0.6340786814689636, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 5704 + }, + { + "epoch": 4.556709265175719, + "grad_norm": 0.13203595578670502, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5705 + }, + { + "epoch": 4.557507987220447, + "grad_norm": 0.16561077535152435, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5706 + }, + { + "epoch": 4.5583067092651754, + "grad_norm": 0.17777414619922638, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 5707 + }, + { + "epoch": 4.559105431309904, + "grad_norm": 0.6985258460044861, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 5708 + }, + { + "epoch": 4.5599041533546325, + "grad_norm": 0.18673790991306305, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5709 + }, + { + "epoch": 4.560702875399361, + "grad_norm": 0.10636870563030243, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5710 + }, + { + "epoch": 4.56150159744409, + "grad_norm": 0.1719052493572235, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5711 + }, + { + "epoch": 4.562300319488818, + "grad_norm": 0.7030455470085144, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5712 + }, + { + "epoch": 4.563099041533547, + "grad_norm": 0.1482628434896469, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 5713 + }, + { + "epoch": 4.563897763578275, + "grad_norm": 0.1585852950811386, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5714 + }, + { + "epoch": 4.564696485623003, + "grad_norm": 0.16067056357860565, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5715 + }, + { + "epoch": 4.565495207667731, + "grad_norm": 0.16162389516830444, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5716 + }, + { + "epoch": 4.56629392971246, + "grad_norm": 0.07224202156066895, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5717 + }, + { + "epoch": 4.567092651757188, + "grad_norm": 0.2577751576900482, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5718 + }, + { + "epoch": 4.567891373801917, + "grad_norm": 1.676942229270935, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5719 + }, + { + "epoch": 4.568690095846645, + "grad_norm": 0.11058419197797775, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5720 + }, + { + "epoch": 4.569488817891374, + "grad_norm": 0.23155376315116882, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5721 + }, + { + "epoch": 4.5702875399361025, + "grad_norm": 0.1197747215628624, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5722 + }, + { + "epoch": 4.571086261980831, + "grad_norm": 0.5179840326309204, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5723 + }, + { + "epoch": 4.571884984025559, + "grad_norm": 0.17717961966991425, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5724 + }, + { + "epoch": 4.572683706070287, + "grad_norm": 0.1513422429561615, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5725 + }, + { + "epoch": 4.573482428115016, + "grad_norm": 0.15495018661022186, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5726 + }, + { + "epoch": 4.574281150159744, + "grad_norm": 3.4248743057250977, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5727 + }, + { + "epoch": 4.575079872204473, + "grad_norm": 0.29529228806495667, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5728 + }, + { + "epoch": 4.575878594249201, + "grad_norm": 0.21125876903533936, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 5729 + }, + { + "epoch": 4.57667731629393, + "grad_norm": 0.16381484270095825, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5730 + }, + { + "epoch": 4.577476038338658, + "grad_norm": 0.2144167572259903, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 5731 + }, + { + "epoch": 4.578274760383387, + "grad_norm": 0.1564428210258484, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5732 + }, + { + "epoch": 4.5790734824281145, + "grad_norm": 0.21137529611587524, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5733 + }, + { + "epoch": 4.579872204472844, + "grad_norm": 0.13836248219013214, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5734 + }, + { + "epoch": 4.580670926517572, + "grad_norm": 0.11749537289142609, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5735 + }, + { + "epoch": 4.5814696485623, + "grad_norm": 0.10901704430580139, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5736 + }, + { + "epoch": 4.582268370607029, + "grad_norm": 0.08402425795793533, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5737 + }, + { + "epoch": 4.583067092651757, + "grad_norm": 0.1502164900302887, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5738 + }, + { + "epoch": 4.583865814696486, + "grad_norm": 0.10606876760721207, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5739 + }, + { + "epoch": 4.584664536741214, + "grad_norm": 0.11868279427289963, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5740 + }, + { + "epoch": 4.585463258785943, + "grad_norm": 0.10678767412900925, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5741 + }, + { + "epoch": 4.586261980830671, + "grad_norm": 0.28886285424232483, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5742 + }, + { + "epoch": 4.5870607028754, + "grad_norm": 0.3516097366809845, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5743 + }, + { + "epoch": 4.587859424920127, + "grad_norm": 0.10221854597330093, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5744 + }, + { + "epoch": 4.588658146964856, + "grad_norm": 0.24786177277565002, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5745 + }, + { + "epoch": 4.5894568690095845, + "grad_norm": 0.10537181794643402, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5746 + }, + { + "epoch": 4.590255591054313, + "grad_norm": 0.23574885725975037, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5747 + }, + { + "epoch": 4.5910543130990416, + "grad_norm": 0.1483563631772995, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5748 + }, + { + "epoch": 4.59185303514377, + "grad_norm": 0.1516815721988678, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 5749 + }, + { + "epoch": 4.592651757188499, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5750 + }, + { + "epoch": 4.593450479233227, + "grad_norm": 0.10706239938735962, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5751 + }, + { + "epoch": 4.594249201277956, + "grad_norm": 1.081868290901184, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5752 + }, + { + "epoch": 4.595047923322683, + "grad_norm": 0.4016919732093811, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5753 + }, + { + "epoch": 4.595846645367412, + "grad_norm": 0.3266371786594391, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5754 + }, + { + "epoch": 4.59664536741214, + "grad_norm": 0.23380769789218903, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5755 + }, + { + "epoch": 4.597444089456869, + "grad_norm": 0.2521349787712097, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 5756 + }, + { + "epoch": 4.598242811501597, + "grad_norm": 0.2223331481218338, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5757 + }, + { + "epoch": 4.599041533546326, + "grad_norm": 0.177442729473114, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 5758 + }, + { + "epoch": 4.5998402555910545, + "grad_norm": 0.18474844098091125, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 5759 + }, + { + "epoch": 4.600638977635783, + "grad_norm": 0.1686495542526245, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5760 + }, + { + "epoch": 4.6014376996805115, + "grad_norm": 0.13674414157867432, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5761 + }, + { + "epoch": 4.602236421725239, + "grad_norm": 0.1390203833580017, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 5762 + }, + { + "epoch": 4.603035143769968, + "grad_norm": 0.10701096057891846, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5763 + }, + { + "epoch": 4.603833865814696, + "grad_norm": 0.110149085521698, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5764 + }, + { + "epoch": 4.604632587859425, + "grad_norm": 0.2477579116821289, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5765 + }, + { + "epoch": 4.605431309904153, + "grad_norm": 0.2554718852043152, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5766 + }, + { + "epoch": 4.606230031948882, + "grad_norm": 0.1945963203907013, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5767 + }, + { + "epoch": 4.60702875399361, + "grad_norm": 0.26785531640052795, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5768 + }, + { + "epoch": 4.607827476038339, + "grad_norm": 0.3007332980632782, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5769 + }, + { + "epoch": 4.608626198083067, + "grad_norm": 0.09973788261413574, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5770 + }, + { + "epoch": 4.609424920127795, + "grad_norm": 0.09176181256771088, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5771 + }, + { + "epoch": 4.6102236421725244, + "grad_norm": 0.1395607590675354, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5772 + }, + { + "epoch": 4.611022364217252, + "grad_norm": 0.8938566446304321, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5773 + }, + { + "epoch": 4.611821086261981, + "grad_norm": 0.3093889653682709, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5774 + }, + { + "epoch": 4.612619808306709, + "grad_norm": 0.1910911351442337, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5775 + }, + { + "epoch": 4.613418530351438, + "grad_norm": 0.11586496978998184, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5776 + }, + { + "epoch": 4.614217252396166, + "grad_norm": 0.222470223903656, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 5777 + }, + { + "epoch": 4.615015974440895, + "grad_norm": 0.16580955684185028, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5778 + }, + { + "epoch": 4.615814696485623, + "grad_norm": 0.11279458552598953, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5779 + }, + { + "epoch": 4.616613418530352, + "grad_norm": 0.10970400273799896, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5780 + }, + { + "epoch": 4.61741214057508, + "grad_norm": 0.11291752755641937, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5781 + }, + { + "epoch": 4.618210862619808, + "grad_norm": 0.19262762367725372, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5782 + }, + { + "epoch": 4.6190095846645365, + "grad_norm": 0.12736102938652039, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5783 + }, + { + "epoch": 4.619808306709265, + "grad_norm": 0.09300720691680908, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5784 + }, + { + "epoch": 4.6206070287539935, + "grad_norm": 0.09544654190540314, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5785 + }, + { + "epoch": 4.621405750798722, + "grad_norm": 0.2888239026069641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5786 + }, + { + "epoch": 4.622204472843451, + "grad_norm": 0.22988484799861908, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5787 + }, + { + "epoch": 4.623003194888179, + "grad_norm": 0.2574143707752228, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5788 + }, + { + "epoch": 4.623801916932908, + "grad_norm": 0.2503221333026886, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5789 + }, + { + "epoch": 4.624600638977636, + "grad_norm": 0.20846052467823029, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5790 + }, + { + "epoch": 4.625399361022364, + "grad_norm": 0.218403160572052, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5791 + }, + { + "epoch": 4.626198083067092, + "grad_norm": 0.11333920061588287, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5792 + }, + { + "epoch": 4.626996805111821, + "grad_norm": 0.19022895395755768, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5793 + }, + { + "epoch": 4.627795527156549, + "grad_norm": 0.1525644063949585, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 5794 + }, + { + "epoch": 4.628594249201278, + "grad_norm": 0.07636452466249466, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5795 + }, + { + "epoch": 4.6293929712460065, + "grad_norm": 0.1358552873134613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5796 + }, + { + "epoch": 4.630191693290735, + "grad_norm": 0.08993138372898102, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5797 + }, + { + "epoch": 4.6309904153354635, + "grad_norm": 0.15454545617103577, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5798 + }, + { + "epoch": 4.631789137380192, + "grad_norm": 0.12256992608308792, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5799 + }, + { + "epoch": 4.63258785942492, + "grad_norm": 0.08453187346458435, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5800 + }, + { + "epoch": 4.633386581469648, + "grad_norm": 0.1474936157464981, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5801 + }, + { + "epoch": 4.634185303514377, + "grad_norm": 0.11481066793203354, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5802 + }, + { + "epoch": 4.634984025559105, + "grad_norm": 0.41141587495803833, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5803 + }, + { + "epoch": 4.635782747603834, + "grad_norm": 0.1509549766778946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5804 + }, + { + "epoch": 4.636581469648562, + "grad_norm": 0.13562771677970886, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5805 + }, + { + "epoch": 4.637380191693291, + "grad_norm": 0.09722459316253662, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5806 + }, + { + "epoch": 4.638178913738019, + "grad_norm": 0.3194493353366852, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5807 + }, + { + "epoch": 4.638977635782748, + "grad_norm": 0.23091651499271393, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5808 + }, + { + "epoch": 4.6397763578274756, + "grad_norm": 0.1682155877351761, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5809 + }, + { + "epoch": 4.640575079872205, + "grad_norm": 0.37293288111686707, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5810 + }, + { + "epoch": 4.641373801916933, + "grad_norm": 0.3746488094329834, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5811 + }, + { + "epoch": 4.642172523961661, + "grad_norm": 0.2068052738904953, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5812 + }, + { + "epoch": 4.64297124600639, + "grad_norm": 0.13229581713676453, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5813 + }, + { + "epoch": 4.643769968051118, + "grad_norm": 0.24158459901809692, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5814 + }, + { + "epoch": 4.644568690095847, + "grad_norm": 0.4241867959499359, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5815 + }, + { + "epoch": 4.645367412140575, + "grad_norm": 0.40008923411369324, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5816 + }, + { + "epoch": 4.646166134185304, + "grad_norm": 0.3150584101676941, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 5817 + }, + { + "epoch": 4.646964856230032, + "grad_norm": 0.11021434515714645, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5818 + }, + { + "epoch": 4.647763578274761, + "grad_norm": 0.30061402916908264, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5819 + }, + { + "epoch": 4.6485623003194885, + "grad_norm": 0.12583592534065247, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5820 + }, + { + "epoch": 4.649361022364217, + "grad_norm": 0.31917983293533325, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5821 + }, + { + "epoch": 4.6501597444089455, + "grad_norm": 0.2097153663635254, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5822 + }, + { + "epoch": 4.650958466453674, + "grad_norm": 0.19847621023654938, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5823 + }, + { + "epoch": 4.651757188498403, + "grad_norm": 0.2482050508260727, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5824 + }, + { + "epoch": 4.652555910543131, + "grad_norm": 0.1257491409778595, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5825 + }, + { + "epoch": 4.65335463258786, + "grad_norm": 0.2192201465368271, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5826 + }, + { + "epoch": 4.654153354632588, + "grad_norm": 0.16453656554222107, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5827 + }, + { + "epoch": 4.654952076677317, + "grad_norm": 0.18813923001289368, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5828 + }, + { + "epoch": 4.655750798722044, + "grad_norm": 0.1811141073703766, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5829 + }, + { + "epoch": 4.656549520766773, + "grad_norm": 0.08911352604627609, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5830 + }, + { + "epoch": 4.657348242811501, + "grad_norm": 0.17858019471168518, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5831 + }, + { + "epoch": 4.65814696485623, + "grad_norm": 0.27315759658813477, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5832 + }, + { + "epoch": 4.6589456869009584, + "grad_norm": 0.18612337112426758, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5833 + }, + { + "epoch": 4.659744408945687, + "grad_norm": 0.2646125257015228, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5834 + }, + { + "epoch": 4.6605431309904155, + "grad_norm": 0.07320903241634369, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5835 + }, + { + "epoch": 4.661341853035144, + "grad_norm": 0.12969297170639038, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5836 + }, + { + "epoch": 4.662140575079873, + "grad_norm": 0.37665078043937683, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5837 + }, + { + "epoch": 4.6629392971246, + "grad_norm": 0.11055029928684235, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 5838 + }, + { + "epoch": 4.663738019169329, + "grad_norm": 0.12279482185840607, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5839 + }, + { + "epoch": 4.664536741214057, + "grad_norm": 0.0686316192150116, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5840 + }, + { + "epoch": 4.665335463258786, + "grad_norm": 0.09705425798892975, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5841 + }, + { + "epoch": 4.666134185303514, + "grad_norm": 0.09543570131063461, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5842 + }, + { + "epoch": 4.666932907348243, + "grad_norm": 0.08460460603237152, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5843 + }, + { + "epoch": 4.667731629392971, + "grad_norm": 0.12419378757476807, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5844 + }, + { + "epoch": 4.6685303514377, + "grad_norm": 0.09184019267559052, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5845 + }, + { + "epoch": 4.669329073482428, + "grad_norm": 0.09425100684165955, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5846 + }, + { + "epoch": 4.670127795527156, + "grad_norm": 0.19701971113681793, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5847 + }, + { + "epoch": 4.6709265175718855, + "grad_norm": 0.0648239254951477, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5848 + }, + { + "epoch": 4.671725239616613, + "grad_norm": 0.11558888107538223, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5849 + }, + { + "epoch": 4.672523961661342, + "grad_norm": 0.12397976219654083, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5850 + }, + { + "epoch": 4.67332268370607, + "grad_norm": 0.10640132427215576, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5851 + }, + { + "epoch": 4.674121405750799, + "grad_norm": 0.08930578827857971, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5852 + }, + { + "epoch": 4.674920127795527, + "grad_norm": 0.06212310120463371, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5853 + }, + { + "epoch": 4.675718849840256, + "grad_norm": 0.08568188548088074, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5854 + }, + { + "epoch": 4.676517571884984, + "grad_norm": 0.11431021988391876, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5855 + }, + { + "epoch": 4.677316293929713, + "grad_norm": 0.34381258487701416, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5856 + }, + { + "epoch": 4.678115015974441, + "grad_norm": 0.1996181309223175, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5857 + }, + { + "epoch": 4.678913738019169, + "grad_norm": 0.2900290787220001, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5858 + }, + { + "epoch": 4.6797124600638975, + "grad_norm": 0.35768410563468933, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5859 + }, + { + "epoch": 4.680511182108626, + "grad_norm": 0.1027536615729332, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5860 + }, + { + "epoch": 4.681309904153355, + "grad_norm": 0.6286419630050659, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5861 + }, + { + "epoch": 4.682108626198083, + "grad_norm": 0.5037242770195007, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5862 + }, + { + "epoch": 4.682907348242812, + "grad_norm": 0.34654417634010315, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5863 + }, + { + "epoch": 4.68370607028754, + "grad_norm": 0.18139366805553436, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5864 + }, + { + "epoch": 4.684504792332269, + "grad_norm": 0.2101605087518692, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5865 + }, + { + "epoch": 4.685303514376997, + "grad_norm": 0.0922360047698021, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5866 + }, + { + "epoch": 4.686102236421725, + "grad_norm": 0.23476624488830566, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5867 + }, + { + "epoch": 4.686900958466453, + "grad_norm": 0.1843792051076889, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5868 + }, + { + "epoch": 4.687699680511182, + "grad_norm": 0.09449298679828644, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5869 + }, + { + "epoch": 4.68849840255591, + "grad_norm": 0.13996686041355133, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5870 + }, + { + "epoch": 4.689297124600639, + "grad_norm": 2.113325357437134, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5871 + }, + { + "epoch": 4.6900958466453675, + "grad_norm": 0.35181209444999695, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 5872 + }, + { + "epoch": 4.690894568690096, + "grad_norm": 0.3530768156051636, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 5873 + }, + { + "epoch": 4.6916932907348246, + "grad_norm": 0.25919783115386963, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5874 + }, + { + "epoch": 4.692492012779553, + "grad_norm": 0.19770720601081848, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 5875 + }, + { + "epoch": 4.693290734824281, + "grad_norm": 0.32085585594177246, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5876 + }, + { + "epoch": 4.694089456869009, + "grad_norm": 0.14215363562107086, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5877 + }, + { + "epoch": 4.694888178913738, + "grad_norm": 0.24502497911453247, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5878 + }, + { + "epoch": 4.695686900958466, + "grad_norm": 0.15765784680843353, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5879 + }, + { + "epoch": 4.696485623003195, + "grad_norm": 0.13945002853870392, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5880 + }, + { + "epoch": 4.697284345047923, + "grad_norm": 0.16315795481204987, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5881 + }, + { + "epoch": 4.698083067092652, + "grad_norm": 0.0803297907114029, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5882 + }, + { + "epoch": 4.69888178913738, + "grad_norm": 0.09848042577505112, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5883 + }, + { + "epoch": 4.699680511182109, + "grad_norm": 0.22370465099811554, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5884 + }, + { + "epoch": 4.700479233226837, + "grad_norm": 0.09369395673274994, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5885 + }, + { + "epoch": 4.701277955271565, + "grad_norm": 0.42340102791786194, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5886 + }, + { + "epoch": 4.702076677316294, + "grad_norm": 0.08471440523862839, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5887 + }, + { + "epoch": 4.702875399361022, + "grad_norm": 0.11350758373737335, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5888 + }, + { + "epoch": 4.703674121405751, + "grad_norm": 0.16862216591835022, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5889 + }, + { + "epoch": 4.704472843450479, + "grad_norm": 0.17468953132629395, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5890 + }, + { + "epoch": 4.705271565495208, + "grad_norm": 0.09154370427131653, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5891 + }, + { + "epoch": 4.706070287539936, + "grad_norm": 0.08715084940195084, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5892 + }, + { + "epoch": 4.706869009584665, + "grad_norm": 0.06797291338443756, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5893 + }, + { + "epoch": 4.707667731629393, + "grad_norm": 0.17333610355854034, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5894 + }, + { + "epoch": 4.708466453674122, + "grad_norm": 0.17272767424583435, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5895 + }, + { + "epoch": 4.7092651757188495, + "grad_norm": 0.11773357540369034, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5896 + }, + { + "epoch": 4.710063897763578, + "grad_norm": 0.08420758694410324, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5897 + }, + { + "epoch": 4.710862619808307, + "grad_norm": 0.08672801405191422, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5898 + }, + { + "epoch": 4.711661341853035, + "grad_norm": 0.2356635183095932, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5899 + }, + { + "epoch": 4.712460063897764, + "grad_norm": 0.06091082841157913, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5900 + }, + { + "epoch": 4.713258785942492, + "grad_norm": 0.09156842529773712, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5901 + }, + { + "epoch": 4.714057507987221, + "grad_norm": 0.06548108160495758, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5902 + }, + { + "epoch": 4.714856230031949, + "grad_norm": 0.12813016772270203, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5903 + }, + { + "epoch": 4.715654952076678, + "grad_norm": 0.1518833339214325, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5904 + }, + { + "epoch": 4.716453674121405, + "grad_norm": 0.09331580996513367, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5905 + }, + { + "epoch": 4.717252396166134, + "grad_norm": 0.11989843845367432, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5906 + }, + { + "epoch": 4.718051118210862, + "grad_norm": 0.1277054399251938, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5907 + }, + { + "epoch": 4.718849840255591, + "grad_norm": 0.11199159920215607, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5908 + }, + { + "epoch": 4.7196485623003195, + "grad_norm": 0.09120891988277435, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5909 + }, + { + "epoch": 4.720447284345048, + "grad_norm": 0.11668230593204498, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5910 + }, + { + "epoch": 4.7212460063897765, + "grad_norm": 0.08594206720590591, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5911 + }, + { + "epoch": 4.722044728434505, + "grad_norm": 0.11563027650117874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5912 + }, + { + "epoch": 4.722843450479234, + "grad_norm": 0.15066663920879364, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5913 + }, + { + "epoch": 4.723642172523961, + "grad_norm": 0.08566875755786896, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5914 + }, + { + "epoch": 4.72444089456869, + "grad_norm": 0.060813747346401215, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5915 + }, + { + "epoch": 4.725239616613418, + "grad_norm": 0.07391642779111862, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5916 + }, + { + "epoch": 4.726038338658147, + "grad_norm": 0.04867766425013542, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5917 + }, + { + "epoch": 4.726837060702875, + "grad_norm": 0.09468305110931396, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5918 + }, + { + "epoch": 4.727635782747604, + "grad_norm": 0.07287945598363876, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5919 + }, + { + "epoch": 4.728434504792332, + "grad_norm": 0.08984806388616562, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5920 + }, + { + "epoch": 4.729233226837061, + "grad_norm": 0.1755092740058899, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5921 + }, + { + "epoch": 4.7300319488817895, + "grad_norm": 0.09656399488449097, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5922 + }, + { + "epoch": 4.730830670926517, + "grad_norm": 0.15759015083312988, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5923 + }, + { + "epoch": 4.731629392971246, + "grad_norm": 0.13238383829593658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5924 + }, + { + "epoch": 4.732428115015974, + "grad_norm": 0.05352601036429405, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5925 + }, + { + "epoch": 4.733226837060703, + "grad_norm": 0.06253937631845474, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5926 + }, + { + "epoch": 4.734025559105431, + "grad_norm": 0.057317376136779785, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5927 + }, + { + "epoch": 4.73482428115016, + "grad_norm": 0.12154382467269897, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5928 + }, + { + "epoch": 4.735623003194888, + "grad_norm": 0.0547759085893631, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5929 + }, + { + "epoch": 4.736421725239617, + "grad_norm": 0.07446085661649704, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5930 + }, + { + "epoch": 4.737220447284345, + "grad_norm": 0.09809007495641708, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5931 + }, + { + "epoch": 4.738019169329074, + "grad_norm": 0.12434732168912888, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5932 + }, + { + "epoch": 4.738817891373802, + "grad_norm": 0.12192053347826004, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5933 + }, + { + "epoch": 4.73961661341853, + "grad_norm": 0.08006733655929565, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5934 + }, + { + "epoch": 4.7404153354632586, + "grad_norm": 0.14677436649799347, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5935 + }, + { + "epoch": 4.741214057507987, + "grad_norm": 0.10133987665176392, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5936 + }, + { + "epoch": 4.742012779552716, + "grad_norm": 0.10331577062606812, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5937 + }, + { + "epoch": 4.742811501597444, + "grad_norm": 0.14596082270145416, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5938 + }, + { + "epoch": 4.743610223642173, + "grad_norm": 0.15139590203762054, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5939 + }, + { + "epoch": 4.744408945686901, + "grad_norm": 0.0935182124376297, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5940 + }, + { + "epoch": 4.74520766773163, + "grad_norm": 0.1002865880727768, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5941 + }, + { + "epoch": 4.746006389776358, + "grad_norm": 0.0968283861875534, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5942 + }, + { + "epoch": 4.746805111821086, + "grad_norm": 0.11680585891008377, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5943 + }, + { + "epoch": 4.747603833865814, + "grad_norm": 0.12163184583187103, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 5944 + }, + { + "epoch": 4.748402555910543, + "grad_norm": 0.07288502901792526, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5945 + }, + { + "epoch": 4.7492012779552715, + "grad_norm": 0.3335740566253662, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5946 + }, + { + "epoch": 4.75, + "grad_norm": 0.15408654510974884, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5947 + }, + { + "epoch": 4.7507987220447285, + "grad_norm": 0.09612353891134262, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5948 + }, + { + "epoch": 4.751597444089457, + "grad_norm": 0.10403789579868317, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5949 + }, + { + "epoch": 4.752396166134186, + "grad_norm": 0.13026492297649384, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5950 + }, + { + "epoch": 4.753194888178914, + "grad_norm": 0.061955004930496216, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5951 + }, + { + "epoch": 4.753993610223642, + "grad_norm": 0.08264514058828354, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5952 + }, + { + "epoch": 4.75479233226837, + "grad_norm": 0.1132993996143341, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5953 + }, + { + "epoch": 4.755591054313099, + "grad_norm": 0.09022228419780731, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5954 + }, + { + "epoch": 4.756389776357827, + "grad_norm": 0.13192631304264069, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5955 + }, + { + "epoch": 4.757188498402556, + "grad_norm": 0.08400337398052216, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5956 + }, + { + "epoch": 4.757987220447284, + "grad_norm": 0.05070018023252487, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5957 + }, + { + "epoch": 4.758785942492013, + "grad_norm": 0.09561482816934586, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5958 + }, + { + "epoch": 4.7595846645367414, + "grad_norm": 0.07369764894247055, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5959 + }, + { + "epoch": 4.76038338658147, + "grad_norm": 0.07777421176433563, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5960 + }, + { + "epoch": 4.761182108626198, + "grad_norm": 0.11525892466306686, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5961 + }, + { + "epoch": 4.761980830670926, + "grad_norm": 0.1788506656885147, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5962 + }, + { + "epoch": 4.762779552715655, + "grad_norm": 0.10067635029554367, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5963 + }, + { + "epoch": 4.763578274760383, + "grad_norm": 0.08447863161563873, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5964 + }, + { + "epoch": 4.764376996805112, + "grad_norm": 0.06801758706569672, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5965 + }, + { + "epoch": 4.76517571884984, + "grad_norm": 0.07363327592611313, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5966 + }, + { + "epoch": 4.765974440894569, + "grad_norm": 0.05584784597158432, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5967 + }, + { + "epoch": 4.766773162939297, + "grad_norm": 0.10064459592103958, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5968 + }, + { + "epoch": 4.767571884984026, + "grad_norm": 0.1176871508359909, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5969 + }, + { + "epoch": 4.768370607028754, + "grad_norm": 0.17485690116882324, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5970 + }, + { + "epoch": 4.769169329073483, + "grad_norm": 0.15753531455993652, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5971 + }, + { + "epoch": 4.7699680511182105, + "grad_norm": 0.1669864058494568, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5972 + }, + { + "epoch": 4.770766773162939, + "grad_norm": 0.07706131786108017, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5973 + }, + { + "epoch": 4.771565495207668, + "grad_norm": 0.3537883460521698, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5974 + }, + { + "epoch": 4.772364217252396, + "grad_norm": 0.20092372596263885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5975 + }, + { + "epoch": 4.773162939297125, + "grad_norm": 0.06521142274141312, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5976 + }, + { + "epoch": 4.773961661341853, + "grad_norm": 0.1203140988945961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5977 + }, + { + "epoch": 4.774760383386582, + "grad_norm": 0.09655500948429108, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5978 + }, + { + "epoch": 4.77555910543131, + "grad_norm": 0.09220302104949951, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5979 + }, + { + "epoch": 4.776357827476039, + "grad_norm": 0.7336251735687256, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5980 + }, + { + "epoch": 4.777156549520766, + "grad_norm": 0.21415477991104126, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5981 + }, + { + "epoch": 4.777955271565495, + "grad_norm": 0.14869220554828644, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5982 + }, + { + "epoch": 4.7787539936102235, + "grad_norm": 0.0779772400856018, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5983 + }, + { + "epoch": 4.779552715654952, + "grad_norm": 0.14274317026138306, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5984 + }, + { + "epoch": 4.7803514376996805, + "grad_norm": 0.11580413579940796, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5985 + }, + { + "epoch": 4.781150159744409, + "grad_norm": 0.055023401975631714, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5986 + }, + { + "epoch": 4.781948881789138, + "grad_norm": 0.11657343804836273, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5987 + }, + { + "epoch": 4.782747603833866, + "grad_norm": 0.07336080819368362, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5988 + }, + { + "epoch": 4.783546325878595, + "grad_norm": 0.06066504120826721, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5989 + }, + { + "epoch": 4.784345047923322, + "grad_norm": 0.05784285068511963, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5990 + }, + { + "epoch": 4.785143769968051, + "grad_norm": 0.06317969411611557, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5991 + }, + { + "epoch": 4.785942492012779, + "grad_norm": 0.1001245379447937, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5992 + }, + { + "epoch": 4.786741214057508, + "grad_norm": 0.0743420347571373, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5993 + }, + { + "epoch": 4.787539936102236, + "grad_norm": 0.07082799077033997, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5994 + }, + { + "epoch": 4.788338658146965, + "grad_norm": 0.11087984591722488, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5995 + }, + { + "epoch": 4.789137380191693, + "grad_norm": 0.05923386290669441, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5996 + }, + { + "epoch": 4.789936102236422, + "grad_norm": 0.1020246297121048, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5997 + }, + { + "epoch": 4.7907348242811505, + "grad_norm": 0.11524185538291931, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5998 + }, + { + "epoch": 4.791533546325878, + "grad_norm": 0.06959006190299988, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5999 + }, + { + "epoch": 4.792332268370607, + "grad_norm": 0.19179846346378326, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6000 + }, + { + "epoch": 4.793130990415335, + "grad_norm": 0.17232562601566315, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6001 + }, + { + "epoch": 4.793929712460064, + "grad_norm": 0.7047739028930664, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6002 + }, + { + "epoch": 4.794728434504792, + "grad_norm": 0.09086379408836365, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6003 + }, + { + "epoch": 4.795527156549521, + "grad_norm": 0.17785955965518951, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6004 + }, + { + "epoch": 4.796325878594249, + "grad_norm": 0.09529274702072144, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6005 + }, + { + "epoch": 4.797124600638978, + "grad_norm": 0.08041567355394363, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6006 + }, + { + "epoch": 4.797923322683706, + "grad_norm": 0.13888375461101532, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6007 + }, + { + "epoch": 4.798722044728435, + "grad_norm": 0.08110564947128296, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6008 + }, + { + "epoch": 4.799520766773163, + "grad_norm": 0.07443006336688995, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6009 + }, + { + "epoch": 4.800319488817891, + "grad_norm": 0.08499104529619217, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6010 + }, + { + "epoch": 4.80111821086262, + "grad_norm": 0.0616084523499012, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6011 + }, + { + "epoch": 4.801916932907348, + "grad_norm": 0.10845918208360672, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6012 + }, + { + "epoch": 4.802715654952077, + "grad_norm": 0.057658810168504715, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6013 + }, + { + "epoch": 4.803514376996805, + "grad_norm": 0.07163018733263016, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6014 + }, + { + "epoch": 4.804313099041534, + "grad_norm": 0.07016896456480026, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6015 + }, + { + "epoch": 4.805111821086262, + "grad_norm": 0.08233597129583359, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6016 + }, + { + "epoch": 4.805910543130991, + "grad_norm": 0.05408332124352455, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6017 + }, + { + "epoch": 4.806709265175719, + "grad_norm": 0.0886560007929802, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6018 + }, + { + "epoch": 4.807507987220447, + "grad_norm": 0.17860093712806702, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6019 + }, + { + "epoch": 4.8083067092651754, + "grad_norm": 0.26264694333076477, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6020 + }, + { + "epoch": 4.809105431309904, + "grad_norm": 0.08523311465978622, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6021 + }, + { + "epoch": 4.8099041533546325, + "grad_norm": 0.09873831272125244, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6022 + }, + { + "epoch": 4.810702875399361, + "grad_norm": 0.16135412454605103, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6023 + }, + { + "epoch": 4.81150159744409, + "grad_norm": 0.08003875613212585, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6024 + }, + { + "epoch": 4.812300319488818, + "grad_norm": 0.09117014706134796, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6025 + }, + { + "epoch": 4.813099041533547, + "grad_norm": 0.2316243052482605, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6026 + }, + { + "epoch": 4.813897763578275, + "grad_norm": 0.16050362586975098, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6027 + }, + { + "epoch": 4.814696485623003, + "grad_norm": 0.13559919595718384, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6028 + }, + { + "epoch": 4.815495207667731, + "grad_norm": 0.08917123824357986, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6029 + }, + { + "epoch": 4.81629392971246, + "grad_norm": 0.11498702317476273, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6030 + }, + { + "epoch": 4.817092651757188, + "grad_norm": 0.14677700400352478, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6031 + }, + { + "epoch": 4.817891373801917, + "grad_norm": 0.08849102258682251, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6032 + }, + { + "epoch": 4.818690095846645, + "grad_norm": 4.0974507331848145, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6033 + }, + { + "epoch": 4.819488817891374, + "grad_norm": 0.24215161800384521, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6034 + }, + { + "epoch": 4.8202875399361025, + "grad_norm": 0.2679882049560547, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6035 + }, + { + "epoch": 4.821086261980831, + "grad_norm": 0.11113203316926956, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6036 + }, + { + "epoch": 4.821884984025559, + "grad_norm": 0.17725592851638794, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6037 + }, + { + "epoch": 4.822683706070287, + "grad_norm": 0.08446694165468216, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6038 + }, + { + "epoch": 4.823482428115016, + "grad_norm": 0.26757946610450745, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6039 + }, + { + "epoch": 4.824281150159744, + "grad_norm": 0.1900561898946762, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6040 + }, + { + "epoch": 4.825079872204473, + "grad_norm": 0.21993426978588104, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6041 + }, + { + "epoch": 4.825878594249201, + "grad_norm": 15.862943649291992, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6042 + }, + { + "epoch": 4.82667731629393, + "grad_norm": 0.793515145778656, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6043 + }, + { + "epoch": 4.827476038338658, + "grad_norm": 0.5607691407203674, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6044 + }, + { + "epoch": 4.828274760383387, + "grad_norm": 0.2853091359138489, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6045 + }, + { + "epoch": 4.8290734824281145, + "grad_norm": 0.3579944670200348, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6046 + }, + { + "epoch": 4.829872204472844, + "grad_norm": 0.26784929633140564, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6047 + }, + { + "epoch": 4.830670926517572, + "grad_norm": 0.2363428920507431, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6048 + }, + { + "epoch": 4.8314696485623, + "grad_norm": 0.2922425866127014, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6049 + }, + { + "epoch": 4.832268370607029, + "grad_norm": 0.2173125147819519, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6050 + }, + { + "epoch": 4.833067092651757, + "grad_norm": 0.23552696406841278, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6051 + }, + { + "epoch": 4.833865814696486, + "grad_norm": 1.2383053302764893, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6052 + }, + { + "epoch": 4.834664536741214, + "grad_norm": 0.3284873366355896, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6053 + }, + { + "epoch": 4.835463258785943, + "grad_norm": 0.15584628283977509, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6054 + }, + { + "epoch": 4.836261980830671, + "grad_norm": 0.3136327862739563, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6055 + }, + { + "epoch": 4.8370607028754, + "grad_norm": 0.19863441586494446, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6056 + }, + { + "epoch": 4.837859424920127, + "grad_norm": 0.273644357919693, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6057 + }, + { + "epoch": 4.838658146964856, + "grad_norm": 0.2560950517654419, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6058 + }, + { + "epoch": 4.8394568690095845, + "grad_norm": 0.2243220955133438, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6059 + }, + { + "epoch": 4.840255591054313, + "grad_norm": 0.16328522562980652, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6060 + }, + { + "epoch": 4.8410543130990416, + "grad_norm": 0.42267754673957825, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6061 + }, + { + "epoch": 4.84185303514377, + "grad_norm": 0.21733495593070984, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6062 + }, + { + "epoch": 4.842651757188499, + "grad_norm": 0.12917862832546234, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6063 + }, + { + "epoch": 4.843450479233227, + "grad_norm": 0.1829921007156372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6064 + }, + { + "epoch": 4.844249201277956, + "grad_norm": 0.08751819282770157, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6065 + }, + { + "epoch": 4.845047923322683, + "grad_norm": 0.16521455347537994, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6066 + }, + { + "epoch": 4.845846645367412, + "grad_norm": 0.4328543543815613, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6067 + }, + { + "epoch": 4.84664536741214, + "grad_norm": 0.2682073712348938, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6068 + }, + { + "epoch": 4.847444089456869, + "grad_norm": 0.15217293798923492, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6069 + }, + { + "epoch": 4.848242811501597, + "grad_norm": 0.12807190418243408, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 6070 + }, + { + "epoch": 4.849041533546326, + "grad_norm": 1.4503207206726074, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6071 + }, + { + "epoch": 4.8498402555910545, + "grad_norm": 0.5045278668403625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6072 + }, + { + "epoch": 4.850638977635783, + "grad_norm": 0.1992882788181305, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6073 + }, + { + "epoch": 4.8514376996805115, + "grad_norm": 0.3178166151046753, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 6074 + }, + { + "epoch": 4.852236421725239, + "grad_norm": 0.1244354322552681, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6075 + }, + { + "epoch": 4.853035143769968, + "grad_norm": 0.2837885320186615, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 6076 + }, + { + "epoch": 4.853833865814696, + "grad_norm": 0.11910229921340942, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6077 + }, + { + "epoch": 4.854632587859425, + "grad_norm": 0.5774815678596497, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6078 + }, + { + "epoch": 4.855431309904153, + "grad_norm": 0.13028140366077423, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6079 + }, + { + "epoch": 4.856230031948882, + "grad_norm": 0.21022816002368927, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6080 + }, + { + "epoch": 4.85702875399361, + "grad_norm": 0.11758062243461609, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6081 + }, + { + "epoch": 4.857827476038339, + "grad_norm": 0.1321621984243393, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6082 + }, + { + "epoch": 4.858626198083067, + "grad_norm": 0.11481605470180511, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6083 + }, + { + "epoch": 4.859424920127795, + "grad_norm": 0.0976998507976532, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6084 + }, + { + "epoch": 4.8602236421725244, + "grad_norm": 0.7211679220199585, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6085 + }, + { + "epoch": 4.861022364217252, + "grad_norm": 0.1417546272277832, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6086 + }, + { + "epoch": 4.861821086261981, + "grad_norm": 0.13830699026584625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6087 + }, + { + "epoch": 4.862619808306709, + "grad_norm": 0.24840030074119568, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6088 + }, + { + "epoch": 4.863418530351438, + "grad_norm": 3.442054033279419, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6089 + }, + { + "epoch": 4.864217252396166, + "grad_norm": 0.21404840052127838, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 6090 + }, + { + "epoch": 4.865015974440895, + "grad_norm": 0.3657711148262024, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6091 + }, + { + "epoch": 4.865814696485623, + "grad_norm": 0.2189537137746811, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6092 + }, + { + "epoch": 4.866613418530352, + "grad_norm": 0.17866109311580658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6093 + }, + { + "epoch": 4.86741214057508, + "grad_norm": 0.19208978116512299, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6094 + }, + { + "epoch": 4.868210862619808, + "grad_norm": 0.08330709487199783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6095 + }, + { + "epoch": 4.8690095846645365, + "grad_norm": 0.1194678544998169, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6096 + }, + { + "epoch": 4.869808306709265, + "grad_norm": 0.07852908223867416, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6097 + }, + { + "epoch": 4.8706070287539935, + "grad_norm": 0.09230814129114151, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 6098 + }, + { + "epoch": 4.871405750798722, + "grad_norm": 0.06775277107954025, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6099 + }, + { + "epoch": 4.872204472843451, + "grad_norm": 0.28747716546058655, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6100 + }, + { + "epoch": 4.873003194888179, + "grad_norm": 0.11956486105918884, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6101 + }, + { + "epoch": 4.873801916932908, + "grad_norm": 0.09843557327985764, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6102 + }, + { + "epoch": 4.874600638977636, + "grad_norm": 0.08408313244581223, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6103 + }, + { + "epoch": 4.875399361022364, + "grad_norm": 0.08230917155742645, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6104 + }, + { + "epoch": 4.876198083067092, + "grad_norm": 0.08927451819181442, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6105 + }, + { + "epoch": 4.876996805111821, + "grad_norm": 0.5961875319480896, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6106 + }, + { + "epoch": 4.877795527156549, + "grad_norm": 0.5851842164993286, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 6107 + }, + { + "epoch": 4.878594249201278, + "grad_norm": 0.4428717792034149, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6108 + }, + { + "epoch": 4.8793929712460065, + "grad_norm": 3.760467052459717, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 6109 + }, + { + "epoch": 4.880191693290735, + "grad_norm": 84.49950408935547, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 6110 + }, + { + "epoch": 4.8809904153354635, + "grad_norm": 66320516.0, + "learning_rate": 0.0005, + "loss": 1.1423, + "step": 6111 + }, + { + "epoch": 4.881789137380192, + "grad_norm": 676613568.0, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 6112 + }, + { + "epoch": 4.88258785942492, + "grad_norm": 2556641280.0, + "learning_rate": 0.0005, + "loss": 1.2458, + "step": 6113 + }, + { + "epoch": 4.883386581469648, + "grad_norm": 21960.341796875, + "learning_rate": 0.0005, + "loss": 1.3163, + "step": 6114 + }, + { + "epoch": 4.884185303514377, + "grad_norm": 3668.3603515625, + "learning_rate": 0.0005, + "loss": 1.4954, + "step": 6115 + }, + { + "epoch": 4.884984025559105, + "grad_norm": 9.501830101013184, + "learning_rate": 0.0005, + "loss": 2.0388, + "step": 6116 + }, + { + "epoch": 4.885782747603834, + "grad_norm": 1.9570647478103638, + "learning_rate": 0.0005, + "loss": 1.3693, + "step": 6117 + }, + { + "epoch": 4.886581469648562, + "grad_norm": 0.9678036570549011, + "learning_rate": 0.0005, + "loss": 1.2694, + "step": 6118 + }, + { + "epoch": 4.887380191693291, + "grad_norm": 0.7094120383262634, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 6119 + }, + { + "epoch": 4.888178913738019, + "grad_norm": 0.4029041826725006, + "learning_rate": 0.0005, + "loss": 1.1809, + "step": 6120 + }, + { + "epoch": 4.888977635782748, + "grad_norm": 0.8682520389556885, + "learning_rate": 0.0005, + "loss": 1.1689, + "step": 6121 + }, + { + "epoch": 4.8897763578274756, + "grad_norm": 0.5829207301139832, + "learning_rate": 0.0005, + "loss": 1.1921, + "step": 6122 + }, + { + "epoch": 4.890575079872205, + "grad_norm": 0.5038579702377319, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6123 + }, + { + "epoch": 4.891373801916933, + "grad_norm": 0.532597005367279, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6124 + }, + { + "epoch": 4.892172523961661, + "grad_norm": 0.20122192800045013, + "learning_rate": 0.0005, + "loss": 1.1399, + "step": 6125 + }, + { + "epoch": 4.89297124600639, + "grad_norm": 0.22419369220733643, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 6126 + }, + { + "epoch": 4.893769968051118, + "grad_norm": 0.2319759726524353, + "learning_rate": 0.0005, + "loss": 1.13, + "step": 6127 + }, + { + "epoch": 4.894568690095847, + "grad_norm": 0.18733178079128265, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 6128 + }, + { + "epoch": 4.895367412140575, + "grad_norm": 0.35497167706489563, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 6129 + }, + { + "epoch": 4.896166134185304, + "grad_norm": 0.2551584243774414, + "learning_rate": 0.0005, + "loss": 1.1236, + "step": 6130 + }, + { + "epoch": 4.896964856230032, + "grad_norm": 0.337982714176178, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 6131 + }, + { + "epoch": 4.897763578274761, + "grad_norm": 0.2945634722709656, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 6132 + }, + { + "epoch": 4.8985623003194885, + "grad_norm": 0.2571047842502594, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 6133 + }, + { + "epoch": 4.899361022364217, + "grad_norm": 0.23297041654586792, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 6134 + }, + { + "epoch": 4.9001597444089455, + "grad_norm": 0.24131764471530914, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 6135 + }, + { + "epoch": 4.900958466453674, + "grad_norm": 0.22283275425434113, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 6136 + }, + { + "epoch": 4.901757188498403, + "grad_norm": 0.1691826730966568, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 6137 + }, + { + "epoch": 4.902555910543131, + "grad_norm": 0.1532466858625412, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 6138 + }, + { + "epoch": 4.90335463258786, + "grad_norm": 0.14135177433490753, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 6139 + }, + { + "epoch": 4.904153354632588, + "grad_norm": 0.14410537481307983, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 6140 + }, + { + "epoch": 4.904952076677317, + "grad_norm": 0.1097448468208313, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 6141 + }, + { + "epoch": 4.905750798722044, + "grad_norm": 0.0851673111319542, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 6142 + }, + { + "epoch": 4.906549520766773, + "grad_norm": 0.13842107355594635, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 6143 + }, + { + "epoch": 4.907348242811501, + "grad_norm": 0.15126317739486694, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 6144 + }, + { + "epoch": 4.90814696485623, + "grad_norm": 0.13176177442073822, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6145 + }, + { + "epoch": 4.9089456869009584, + "grad_norm": 0.164788156747818, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 6146 + }, + { + "epoch": 4.909744408945687, + "grad_norm": 0.24943718314170837, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6147 + }, + { + "epoch": 4.9105431309904155, + "grad_norm": 0.4325760304927826, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 6148 + }, + { + "epoch": 4.911341853035144, + "grad_norm": 0.5711309313774109, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 6149 + }, + { + "epoch": 4.912140575079873, + "grad_norm": 0.37636998295783997, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 6150 + }, + { + "epoch": 4.9129392971246, + "grad_norm": 0.2788292169570923, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 6151 + }, + { + "epoch": 4.913738019169329, + "grad_norm": 0.31709909439086914, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 6152 + }, + { + "epoch": 4.914536741214057, + "grad_norm": 0.14585916697978973, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6153 + }, + { + "epoch": 4.915335463258786, + "grad_norm": 0.1302923858165741, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 6154 + }, + { + "epoch": 4.916134185303514, + "grad_norm": 0.16156400740146637, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6155 + }, + { + "epoch": 4.916932907348243, + "grad_norm": 0.2323192059993744, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6156 + }, + { + "epoch": 4.917731629392971, + "grad_norm": 0.17504405975341797, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 6157 + }, + { + "epoch": 4.9185303514377, + "grad_norm": 0.07211807370185852, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6158 + }, + { + "epoch": 4.919329073482428, + "grad_norm": 0.26426371932029724, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6159 + }, + { + "epoch": 4.920127795527156, + "grad_norm": 0.237858384847641, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 6160 + }, + { + "epoch": 4.9209265175718855, + "grad_norm": 0.23863473534584045, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6161 + }, + { + "epoch": 4.921725239616613, + "grad_norm": 0.3053814768791199, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6162 + }, + { + "epoch": 4.922523961661342, + "grad_norm": 0.2143447995185852, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6163 + }, + { + "epoch": 4.92332268370607, + "grad_norm": 0.12295633554458618, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 6164 + }, + { + "epoch": 4.924121405750799, + "grad_norm": 0.11128787696361542, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6165 + }, + { + "epoch": 4.924920127795527, + "grad_norm": 0.158652663230896, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 6166 + }, + { + "epoch": 4.925718849840256, + "grad_norm": 0.17612649500370026, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6167 + }, + { + "epoch": 4.926517571884984, + "grad_norm": 0.12243206799030304, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6168 + }, + { + "epoch": 4.927316293929713, + "grad_norm": 0.12234453856945038, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6169 + }, + { + "epoch": 4.928115015974441, + "grad_norm": 0.1968356966972351, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6170 + }, + { + "epoch": 4.928913738019169, + "grad_norm": 0.17286576330661774, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6171 + }, + { + "epoch": 4.9297124600638975, + "grad_norm": 0.0847749337553978, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6172 + }, + { + "epoch": 4.930511182108626, + "grad_norm": 0.0704331174492836, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6173 + }, + { + "epoch": 4.931309904153355, + "grad_norm": 0.12671123445034027, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6174 + }, + { + "epoch": 4.932108626198083, + "grad_norm": 0.10653524100780487, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6175 + }, + { + "epoch": 4.932907348242812, + "grad_norm": 0.0606958381831646, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6176 + }, + { + "epoch": 4.93370607028754, + "grad_norm": 0.12248247116804123, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6177 + }, + { + "epoch": 4.934504792332269, + "grad_norm": 0.1370074301958084, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6178 + }, + { + "epoch": 4.935303514376997, + "grad_norm": 0.05940835922956467, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6179 + }, + { + "epoch": 4.936102236421725, + "grad_norm": 0.1440308690071106, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6180 + }, + { + "epoch": 4.936900958466453, + "grad_norm": 0.1972372829914093, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6181 + }, + { + "epoch": 4.937699680511182, + "grad_norm": 0.10575850307941437, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6182 + }, + { + "epoch": 4.93849840255591, + "grad_norm": 0.11902400851249695, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6183 + }, + { + "epoch": 4.939297124600639, + "grad_norm": 0.15276090800762177, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6184 + }, + { + "epoch": 4.9400958466453675, + "grad_norm": 0.07495112717151642, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6185 + }, + { + "epoch": 4.940894568690096, + "grad_norm": 0.10652542859315872, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6186 + }, + { + "epoch": 4.9416932907348246, + "grad_norm": 0.11347164958715439, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6187 + }, + { + "epoch": 4.942492012779553, + "grad_norm": 0.19946135580539703, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6188 + }, + { + "epoch": 4.943290734824281, + "grad_norm": 0.0771450325846672, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6189 + }, + { + "epoch": 4.944089456869009, + "grad_norm": 0.1086430475115776, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6190 + }, + { + "epoch": 4.944888178913738, + "grad_norm": 0.08790839463472366, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6191 + }, + { + "epoch": 4.945686900958466, + "grad_norm": 0.22063800692558289, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6192 + }, + { + "epoch": 4.946485623003195, + "grad_norm": 0.22287815809249878, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6193 + }, + { + "epoch": 4.947284345047923, + "grad_norm": 1.695265769958496, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6194 + }, + { + "epoch": 4.948083067092652, + "grad_norm": 0.6316840052604675, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 6195 + }, + { + "epoch": 4.94888178913738, + "grad_norm": 0.35637202858924866, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 6196 + }, + { + "epoch": 4.949680511182109, + "grad_norm": 0.2844616174697876, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 6197 + }, + { + "epoch": 4.950479233226837, + "grad_norm": 0.19614022970199585, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 6198 + }, + { + "epoch": 4.951277955271565, + "grad_norm": 0.3665562868118286, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 6199 + }, + { + "epoch": 4.952076677316294, + "grad_norm": 0.1485169231891632, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 6200 + }, + { + "epoch": 4.952875399361022, + "grad_norm": 0.19647273421287537, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6201 + }, + { + "epoch": 4.953674121405751, + "grad_norm": 0.19809085130691528, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6202 + }, + { + "epoch": 4.954472843450479, + "grad_norm": 0.1129874736070633, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6203 + }, + { + "epoch": 4.955271565495208, + "grad_norm": 0.2082832157611847, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6204 + }, + { + "epoch": 4.956070287539936, + "grad_norm": 0.20414425432682037, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6205 + }, + { + "epoch": 4.956869009584665, + "grad_norm": 0.16667422652244568, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6206 + }, + { + "epoch": 4.957667731629393, + "grad_norm": 0.25111839175224304, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 6207 + }, + { + "epoch": 4.958466453674122, + "grad_norm": 0.16995272040367126, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 6208 + }, + { + "epoch": 4.9592651757188495, + "grad_norm": 0.10725044459104538, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6209 + }, + { + "epoch": 4.960063897763578, + "grad_norm": 0.17728300392627716, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6210 + }, + { + "epoch": 4.960862619808307, + "grad_norm": 0.1334110051393509, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6211 + }, + { + "epoch": 4.961661341853035, + "grad_norm": 0.14835794270038605, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6212 + }, + { + "epoch": 4.962460063897764, + "grad_norm": 0.14602027833461761, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6213 + }, + { + "epoch": 4.963258785942492, + "grad_norm": 0.162953719496727, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6214 + }, + { + "epoch": 4.964057507987221, + "grad_norm": 0.7214393615722656, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6215 + }, + { + "epoch": 4.964856230031949, + "grad_norm": 0.27030259370803833, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6216 + }, + { + "epoch": 4.965654952076678, + "grad_norm": 0.18558967113494873, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 6217 + }, + { + "epoch": 4.966453674121405, + "grad_norm": 0.09276804327964783, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6218 + }, + { + "epoch": 4.967252396166134, + "grad_norm": 0.11957832425832748, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6219 + }, + { + "epoch": 4.968051118210862, + "grad_norm": 0.8338447213172913, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 6220 + }, + { + "epoch": 4.968849840255591, + "grad_norm": 0.7283904552459717, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 6221 + }, + { + "epoch": 4.9696485623003195, + "grad_norm": 0.07938430458307266, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6222 + }, + { + "epoch": 4.970447284345048, + "grad_norm": 0.15368770062923431, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6223 + }, + { + "epoch": 4.9712460063897765, + "grad_norm": 0.08823438733816147, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6224 + }, + { + "epoch": 4.972044728434505, + "grad_norm": 0.07656054943799973, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6225 + }, + { + "epoch": 4.972843450479234, + "grad_norm": 0.08777901530265808, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6226 + }, + { + "epoch": 4.973642172523961, + "grad_norm": 0.09863653033971786, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6227 + }, + { + "epoch": 4.97444089456869, + "grad_norm": 0.13259904086589813, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6228 + }, + { + "epoch": 4.975239616613418, + "grad_norm": 0.08148759603500366, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6229 + }, + { + "epoch": 4.976038338658147, + "grad_norm": 0.06982999294996262, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6230 + }, + { + "epoch": 4.976837060702875, + "grad_norm": 0.09279565513134003, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6231 + }, + { + "epoch": 4.977635782747604, + "grad_norm": 0.05821947008371353, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6232 + }, + { + "epoch": 4.978434504792332, + "grad_norm": 0.07475738972425461, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6233 + }, + { + "epoch": 4.979233226837061, + "grad_norm": 0.10464147478342056, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6234 + }, + { + "epoch": 4.9800319488817895, + "grad_norm": 0.08045687526464462, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6235 + }, + { + "epoch": 4.980830670926517, + "grad_norm": 0.08045300841331482, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6236 + }, + { + "epoch": 4.981629392971246, + "grad_norm": 0.10313838720321655, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6237 + }, + { + "epoch": 4.982428115015974, + "grad_norm": 0.08065208047628403, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6238 + }, + { + "epoch": 4.983226837060703, + "grad_norm": 0.0807032585144043, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6239 + }, + { + "epoch": 4.984025559105431, + "grad_norm": 0.06274307519197464, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6240 + }, + { + "epoch": 4.98482428115016, + "grad_norm": 0.07299554347991943, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6241 + }, + { + "epoch": 4.985623003194888, + "grad_norm": 0.0592481754720211, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6242 + }, + { + "epoch": 4.986421725239617, + "grad_norm": 0.0766056478023529, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6243 + }, + { + "epoch": 4.987220447284345, + "grad_norm": 0.07707066088914871, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6244 + }, + { + "epoch": 4.988019169329074, + "grad_norm": 0.7231665849685669, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6245 + }, + { + "epoch": 4.988817891373802, + "grad_norm": 0.0678652748465538, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6246 + }, + { + "epoch": 4.98961661341853, + "grad_norm": 3.667872905731201, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6247 + }, + { + "epoch": 4.9904153354632586, + "grad_norm": 0.2416938990354538, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6248 + }, + { + "epoch": 4.991214057507987, + "grad_norm": 0.27054834365844727, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6249 + }, + { + "epoch": 4.992012779552716, + "grad_norm": 0.1435888707637787, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6250 + }, + { + "epoch": 4.992811501597444, + "grad_norm": 0.1542683094739914, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6251 + }, + { + "epoch": 4.993610223642173, + "grad_norm": 0.1867702603340149, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6252 + }, + { + "epoch": 4.994408945686901, + "grad_norm": 0.09558507800102234, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6253 + }, + { + "epoch": 4.99520766773163, + "grad_norm": 0.3019699156284332, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6254 + }, + { + "epoch": 4.996006389776358, + "grad_norm": 0.11987117677927017, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6255 + }, + { + "epoch": 4.996805111821086, + "grad_norm": 0.11792664974927902, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6256 + }, + { + "epoch": 4.997603833865814, + "grad_norm": 0.15580247342586517, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6257 + }, + { + "epoch": 4.998402555910543, + "grad_norm": 0.20167642831802368, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6258 + }, + { + "epoch": 4.9992012779552715, + "grad_norm": 0.11203871667385101, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6259 + }, + { + "epoch": 5.0, + "grad_norm": 0.11081275343894958, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6260 + }, + { + "epoch": 5.0007987220447285, + "grad_norm": 0.11213719099760056, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6261 + }, + { + "epoch": 5.001597444089457, + "grad_norm": 0.11074960231781006, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6262 + }, + { + "epoch": 5.002396166134186, + "grad_norm": 0.07538039237260818, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6263 + }, + { + "epoch": 5.003194888178914, + "grad_norm": 0.0824185386300087, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6264 + }, + { + "epoch": 5.003993610223642, + "grad_norm": 0.08940225094556808, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6265 + }, + { + "epoch": 5.00479233226837, + "grad_norm": 0.07072590291500092, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6266 + }, + { + "epoch": 5.005591054313099, + "grad_norm": 0.13027220964431763, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6267 + }, + { + "epoch": 5.006389776357827, + "grad_norm": 0.09226793050765991, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6268 + }, + { + "epoch": 5.007188498402556, + "grad_norm": 0.1879013329744339, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6269 + }, + { + "epoch": 5.007987220447284, + "grad_norm": 0.09063144028186798, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6270 + }, + { + "epoch": 5.008785942492013, + "grad_norm": 0.09013621509075165, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6271 + }, + { + "epoch": 5.0095846645367414, + "grad_norm": 0.2404542863368988, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6272 + }, + { + "epoch": 5.01038338658147, + "grad_norm": 0.11968059092760086, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6273 + }, + { + "epoch": 5.0111821086261985, + "grad_norm": 0.16429072618484497, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6274 + }, + { + "epoch": 5.011980830670926, + "grad_norm": 0.08745420724153519, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6275 + }, + { + "epoch": 5.012779552715655, + "grad_norm": 0.09130390733480453, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6276 + }, + { + "epoch": 5.013578274760383, + "grad_norm": 0.06996344774961472, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6277 + }, + { + "epoch": 5.014376996805112, + "grad_norm": 0.06063826382160187, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6278 + }, + { + "epoch": 5.01517571884984, + "grad_norm": 0.14752542972564697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6279 + }, + { + "epoch": 5.015974440894569, + "grad_norm": 0.05987429618835449, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6280 + }, + { + "epoch": 5.016773162939297, + "grad_norm": 0.1716211587190628, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6281 + }, + { + "epoch": 5.017571884984026, + "grad_norm": 0.13823190331459045, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6282 + }, + { + "epoch": 5.018370607028754, + "grad_norm": 0.09764201194047928, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6283 + }, + { + "epoch": 5.019169329073482, + "grad_norm": 0.07897874712944031, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6284 + }, + { + "epoch": 5.0199680511182105, + "grad_norm": 0.07823392748832703, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6285 + }, + { + "epoch": 5.020766773162939, + "grad_norm": 0.1033136323094368, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6286 + }, + { + "epoch": 5.021565495207668, + "grad_norm": 0.07100827991962433, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6287 + }, + { + "epoch": 5.022364217252396, + "grad_norm": 0.40211987495422363, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 6288 + }, + { + "epoch": 5.023162939297125, + "grad_norm": 0.15459896624088287, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6289 + }, + { + "epoch": 5.023961661341853, + "grad_norm": 0.07789050787687302, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6290 + }, + { + "epoch": 5.024760383386582, + "grad_norm": 0.2116134762763977, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6291 + }, + { + "epoch": 5.02555910543131, + "grad_norm": 0.1842123568058014, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6292 + }, + { + "epoch": 5.026357827476039, + "grad_norm": 0.2037680447101593, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6293 + }, + { + "epoch": 5.027156549520766, + "grad_norm": 0.10851238667964935, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6294 + }, + { + "epoch": 5.027955271565495, + "grad_norm": 0.14465196430683136, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6295 + }, + { + "epoch": 5.0287539936102235, + "grad_norm": 0.11993128806352615, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6296 + }, + { + "epoch": 5.029552715654952, + "grad_norm": 0.13647349178791046, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6297 + }, + { + "epoch": 5.0303514376996805, + "grad_norm": 0.11265698075294495, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 6298 + }, + { + "epoch": 5.031150159744409, + "grad_norm": 18.601808547973633, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6299 + }, + { + "epoch": 5.031948881789138, + "grad_norm": 0.40079689025878906, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6300 + }, + { + "epoch": 5.032747603833866, + "grad_norm": 3.513967752456665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6301 + }, + { + "epoch": 5.033546325878595, + "grad_norm": 24.040191650390625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6302 + }, + { + "epoch": 5.034345047923322, + "grad_norm": 0.7786405086517334, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6303 + }, + { + "epoch": 5.035143769968051, + "grad_norm": 0.619868814945221, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6304 + }, + { + "epoch": 5.035942492012779, + "grad_norm": 6.039219379425049, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6305 + }, + { + "epoch": 5.036741214057508, + "grad_norm": 23.90920639038086, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 6306 + }, + { + "epoch": 5.037539936102236, + "grad_norm": 1.296809196472168, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 6307 + }, + { + "epoch": 5.038338658146965, + "grad_norm": 0.7673514485359192, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 6308 + }, + { + "epoch": 5.039137380191693, + "grad_norm": 0.5065979957580566, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 6309 + }, + { + "epoch": 5.039936102236422, + "grad_norm": 0.3858639597892761, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 6310 + }, + { + "epoch": 5.0407348242811505, + "grad_norm": 0.2647075653076172, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 6311 + }, + { + "epoch": 5.041533546325879, + "grad_norm": 0.2713094651699066, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 6312 + }, + { + "epoch": 5.042332268370607, + "grad_norm": 0.2573802173137665, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 6313 + }, + { + "epoch": 5.043130990415335, + "grad_norm": 0.2083175778388977, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 6314 + }, + { + "epoch": 5.043929712460064, + "grad_norm": 0.3625626564025879, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 6315 + }, + { + "epoch": 5.044728434504792, + "grad_norm": 0.331129789352417, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 6316 + }, + { + "epoch": 5.045527156549521, + "grad_norm": 0.23352555930614471, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 6317 + }, + { + "epoch": 5.046325878594249, + "grad_norm": 0.24043256044387817, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 6318 + }, + { + "epoch": 5.047124600638978, + "grad_norm": 0.31510207056999207, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 6319 + }, + { + "epoch": 5.047923322683706, + "grad_norm": 0.6896952390670776, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 6320 + }, + { + "epoch": 5.048722044728435, + "grad_norm": 0.7915457487106323, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 6321 + }, + { + "epoch": 5.0495207667731625, + "grad_norm": 0.2959117889404297, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 6322 + }, + { + "epoch": 5.050319488817891, + "grad_norm": 0.44844529032707214, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 6323 + }, + { + "epoch": 5.05111821086262, + "grad_norm": 0.3385697305202484, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 6324 + }, + { + "epoch": 5.051916932907348, + "grad_norm": 0.31220802664756775, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 6325 + }, + { + "epoch": 5.052715654952077, + "grad_norm": 0.3420731723308563, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 6326 + }, + { + "epoch": 5.053514376996805, + "grad_norm": 0.3061322569847107, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 6327 + }, + { + "epoch": 5.054313099041534, + "grad_norm": 0.6878030300140381, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 6328 + }, + { + "epoch": 5.055111821086262, + "grad_norm": 0.1927136927843094, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 6329 + }, + { + "epoch": 5.055910543130991, + "grad_norm": 0.24812163412570953, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6330 + }, + { + "epoch": 5.056709265175719, + "grad_norm": 0.19675321877002716, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6331 + }, + { + "epoch": 5.057507987220447, + "grad_norm": 0.20720984041690826, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6332 + }, + { + "epoch": 5.0583067092651754, + "grad_norm": 0.1260477900505066, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6333 + }, + { + "epoch": 5.059105431309904, + "grad_norm": 0.24399158358573914, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6334 + }, + { + "epoch": 5.0599041533546325, + "grad_norm": 0.22406993806362152, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6335 + }, + { + "epoch": 5.060702875399361, + "grad_norm": 0.24807684123516083, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6336 + }, + { + "epoch": 5.06150159744409, + "grad_norm": 0.1272616684436798, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6337 + }, + { + "epoch": 5.062300319488818, + "grad_norm": 0.2053418755531311, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6338 + }, + { + "epoch": 5.063099041533547, + "grad_norm": 0.13628287613391876, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6339 + }, + { + "epoch": 5.063897763578275, + "grad_norm": 0.21262522041797638, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6340 + }, + { + "epoch": 5.064696485623003, + "grad_norm": 0.3784351646900177, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6341 + }, + { + "epoch": 5.065495207667731, + "grad_norm": 0.3282131552696228, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6342 + }, + { + "epoch": 5.06629392971246, + "grad_norm": 0.10128312557935715, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6343 + }, + { + "epoch": 5.067092651757188, + "grad_norm": 0.2297000139951706, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6344 + }, + { + "epoch": 5.067891373801917, + "grad_norm": 0.11327458173036575, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6345 + }, + { + "epoch": 5.068690095846645, + "grad_norm": 0.16150346398353577, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6346 + }, + { + "epoch": 5.069488817891374, + "grad_norm": 0.15486986935138702, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6347 + }, + { + "epoch": 5.0702875399361025, + "grad_norm": 0.12427826225757599, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6348 + }, + { + "epoch": 5.071086261980831, + "grad_norm": 0.11321424692869186, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6349 + }, + { + "epoch": 5.0718849840255595, + "grad_norm": 0.12668851017951965, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6350 + }, + { + "epoch": 5.072683706070287, + "grad_norm": 0.20059579610824585, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6351 + }, + { + "epoch": 5.073482428115016, + "grad_norm": 0.14591605961322784, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6352 + }, + { + "epoch": 5.074281150159744, + "grad_norm": 0.19168664515018463, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6353 + }, + { + "epoch": 5.075079872204473, + "grad_norm": 0.19381079077720642, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6354 + }, + { + "epoch": 5.075878594249201, + "grad_norm": 0.0957496389746666, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6355 + }, + { + "epoch": 5.07667731629393, + "grad_norm": 0.11414145678281784, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6356 + }, + { + "epoch": 5.077476038338658, + "grad_norm": 0.10855124145746231, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6357 + }, + { + "epoch": 5.078274760383387, + "grad_norm": 0.2300068736076355, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6358 + }, + { + "epoch": 5.079073482428115, + "grad_norm": 0.15098270773887634, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 6359 + }, + { + "epoch": 5.079872204472843, + "grad_norm": 0.09821227937936783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6360 + }, + { + "epoch": 5.080670926517572, + "grad_norm": 0.135583758354187, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6361 + }, + { + "epoch": 5.0814696485623, + "grad_norm": 0.07262608408927917, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6362 + }, + { + "epoch": 5.082268370607029, + "grad_norm": 0.10731761902570724, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6363 + }, + { + "epoch": 5.083067092651757, + "grad_norm": 0.27508556842803955, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6364 + }, + { + "epoch": 5.083865814696486, + "grad_norm": 0.12996995449066162, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6365 + }, + { + "epoch": 5.084664536741214, + "grad_norm": 0.10386788845062256, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6366 + }, + { + "epoch": 5.085463258785943, + "grad_norm": 0.07591816782951355, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6367 + }, + { + "epoch": 5.086261980830671, + "grad_norm": 0.09341761469841003, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6368 + }, + { + "epoch": 5.0870607028754, + "grad_norm": 0.12575088441371918, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6369 + }, + { + "epoch": 5.087859424920127, + "grad_norm": 0.3423956036567688, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 6370 + }, + { + "epoch": 5.088658146964856, + "grad_norm": 0.2154775857925415, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6371 + }, + { + "epoch": 5.0894568690095845, + "grad_norm": 0.1550479382276535, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6372 + }, + { + "epoch": 5.090255591054313, + "grad_norm": 0.08802525699138641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6373 + }, + { + "epoch": 5.0910543130990416, + "grad_norm": 0.08421735465526581, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6374 + }, + { + "epoch": 5.09185303514377, + "grad_norm": 0.08920808881521225, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6375 + }, + { + "epoch": 5.092651757188499, + "grad_norm": 0.1450507938861847, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6376 + }, + { + "epoch": 5.093450479233227, + "grad_norm": 0.16926947236061096, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 6377 + }, + { + "epoch": 5.094249201277956, + "grad_norm": 0.6995428204536438, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6378 + }, + { + "epoch": 5.095047923322683, + "grad_norm": 0.10353969782590866, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6379 + }, + { + "epoch": 5.095846645367412, + "grad_norm": 0.09132180362939835, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6380 + }, + { + "epoch": 5.09664536741214, + "grad_norm": 0.17745476961135864, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6381 + }, + { + "epoch": 5.097444089456869, + "grad_norm": 0.10596930980682373, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6382 + }, + { + "epoch": 5.098242811501597, + "grad_norm": 0.11676348745822906, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6383 + }, + { + "epoch": 5.099041533546326, + "grad_norm": 0.13022664189338684, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6384 + }, + { + "epoch": 5.0998402555910545, + "grad_norm": 0.11169753223657608, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6385 + }, + { + "epoch": 5.100638977635783, + "grad_norm": 0.07439867407083511, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6386 + }, + { + "epoch": 5.1014376996805115, + "grad_norm": 0.06953777372837067, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6387 + }, + { + "epoch": 5.102236421725239, + "grad_norm": 0.09419669955968857, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6388 + }, + { + "epoch": 5.103035143769968, + "grad_norm": 0.1166587546467781, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6389 + }, + { + "epoch": 5.103833865814696, + "grad_norm": 0.5776185393333435, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6390 + }, + { + "epoch": 5.104632587859425, + "grad_norm": 0.13175810873508453, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6391 + }, + { + "epoch": 5.105431309904153, + "grad_norm": 0.09372890740633011, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6392 + }, + { + "epoch": 5.106230031948882, + "grad_norm": 0.25262513756752014, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6393 + }, + { + "epoch": 5.10702875399361, + "grad_norm": 0.1348644196987152, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6394 + }, + { + "epoch": 5.107827476038339, + "grad_norm": 0.23879335820674896, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6395 + }, + { + "epoch": 5.108626198083067, + "grad_norm": 0.25561729073524475, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6396 + }, + { + "epoch": 5.109424920127796, + "grad_norm": 0.26974916458129883, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6397 + }, + { + "epoch": 5.110223642172524, + "grad_norm": 0.1866329163312912, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6398 + }, + { + "epoch": 5.111022364217252, + "grad_norm": 0.22104737162590027, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6399 + }, + { + "epoch": 5.111821086261981, + "grad_norm": 0.3775753676891327, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6400 + }, + { + "epoch": 5.112619808306709, + "grad_norm": 0.20636002719402313, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6401 + }, + { + "epoch": 5.113418530351438, + "grad_norm": 0.1941772699356079, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6402 + }, + { + "epoch": 5.114217252396166, + "grad_norm": 0.14595480263233185, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6403 + }, + { + "epoch": 5.115015974440895, + "grad_norm": 0.16794493794441223, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6404 + }, + { + "epoch": 5.115814696485623, + "grad_norm": 0.16466112434864044, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6405 + }, + { + "epoch": 5.116613418530352, + "grad_norm": 0.27192312479019165, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6406 + }, + { + "epoch": 5.11741214057508, + "grad_norm": 0.296017050743103, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6407 + }, + { + "epoch": 5.118210862619808, + "grad_norm": 0.24947655200958252, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6408 + }, + { + "epoch": 5.1190095846645365, + "grad_norm": 0.07843278348445892, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6409 + }, + { + "epoch": 5.119808306709265, + "grad_norm": 0.2507891356945038, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6410 + }, + { + "epoch": 5.1206070287539935, + "grad_norm": 0.2962022125720978, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6411 + }, + { + "epoch": 5.121405750798722, + "grad_norm": 0.21588601171970367, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6412 + }, + { + "epoch": 5.122204472843451, + "grad_norm": 0.27223092317581177, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6413 + }, + { + "epoch": 5.123003194888179, + "grad_norm": 0.1475650519132614, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6414 + }, + { + "epoch": 5.123801916932908, + "grad_norm": 0.2624805271625519, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6415 + }, + { + "epoch": 5.124600638977636, + "grad_norm": 0.27691081166267395, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6416 + }, + { + "epoch": 5.125399361022364, + "grad_norm": 0.1828494369983673, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6417 + }, + { + "epoch": 5.126198083067092, + "grad_norm": 0.27542614936828613, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 6418 + }, + { + "epoch": 5.126996805111821, + "grad_norm": 0.16250371932983398, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6419 + }, + { + "epoch": 5.127795527156549, + "grad_norm": 0.17180733382701874, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6420 + }, + { + "epoch": 5.128594249201278, + "grad_norm": 0.21466004848480225, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6421 + }, + { + "epoch": 5.1293929712460065, + "grad_norm": 0.13144539296627045, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6422 + }, + { + "epoch": 5.130191693290735, + "grad_norm": 0.158688023686409, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6423 + }, + { + "epoch": 5.1309904153354635, + "grad_norm": 0.1430175006389618, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6424 + }, + { + "epoch": 5.131789137380192, + "grad_norm": 0.0988554134964943, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6425 + }, + { + "epoch": 5.13258785942492, + "grad_norm": 0.18320757150650024, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6426 + }, + { + "epoch": 5.133386581469648, + "grad_norm": 0.34172165393829346, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6427 + }, + { + "epoch": 5.134185303514377, + "grad_norm": 0.095450758934021, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6428 + }, + { + "epoch": 5.134984025559105, + "grad_norm": 0.2988479733467102, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6429 + }, + { + "epoch": 5.135782747603834, + "grad_norm": 0.11462085694074631, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6430 + }, + { + "epoch": 5.136581469648562, + "grad_norm": 0.11989153176546097, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6431 + }, + { + "epoch": 5.137380191693291, + "grad_norm": 0.15308552980422974, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6432 + }, + { + "epoch": 5.138178913738019, + "grad_norm": 0.1119944304227829, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6433 + }, + { + "epoch": 5.138977635782748, + "grad_norm": 0.38812172412872314, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6434 + }, + { + "epoch": 5.139776357827476, + "grad_norm": 0.24718649685382843, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6435 + }, + { + "epoch": 5.140575079872204, + "grad_norm": 0.15834778547286987, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6436 + }, + { + "epoch": 5.141373801916933, + "grad_norm": 0.1960451751947403, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6437 + }, + { + "epoch": 5.142172523961661, + "grad_norm": 0.16195416450500488, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6438 + }, + { + "epoch": 5.14297124600639, + "grad_norm": 0.07554367184638977, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6439 + }, + { + "epoch": 5.143769968051118, + "grad_norm": 0.18924687802791595, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6440 + }, + { + "epoch": 5.144568690095847, + "grad_norm": 0.16253480315208435, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6441 + }, + { + "epoch": 5.145367412140575, + "grad_norm": 0.12711918354034424, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6442 + }, + { + "epoch": 5.146166134185304, + "grad_norm": 0.16831086575984955, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6443 + }, + { + "epoch": 5.146964856230032, + "grad_norm": 0.35199087858200073, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6444 + }, + { + "epoch": 5.147763578274761, + "grad_norm": 0.1340232491493225, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6445 + }, + { + "epoch": 5.1485623003194885, + "grad_norm": 0.1397274285554886, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6446 + }, + { + "epoch": 5.149361022364217, + "grad_norm": 0.13868366181850433, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6447 + }, + { + "epoch": 5.1501597444089455, + "grad_norm": 0.08846192806959152, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6448 + }, + { + "epoch": 5.150958466453674, + "grad_norm": 0.08350610733032227, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6449 + }, + { + "epoch": 5.151757188498403, + "grad_norm": 0.14727875590324402, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6450 + }, + { + "epoch": 5.152555910543131, + "grad_norm": 0.11705708503723145, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6451 + }, + { + "epoch": 5.15335463258786, + "grad_norm": 0.10308192670345306, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6452 + }, + { + "epoch": 5.154153354632588, + "grad_norm": 0.09459209442138672, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6453 + }, + { + "epoch": 5.154952076677317, + "grad_norm": 0.11605191230773926, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6454 + }, + { + "epoch": 5.155750798722044, + "grad_norm": 0.24275821447372437, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6455 + }, + { + "epoch": 5.156549520766773, + "grad_norm": 0.208640456199646, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6456 + }, + { + "epoch": 5.157348242811501, + "grad_norm": 0.15257662534713745, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6457 + }, + { + "epoch": 5.15814696485623, + "grad_norm": 0.10431355237960815, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6458 + }, + { + "epoch": 5.1589456869009584, + "grad_norm": 0.14187589287757874, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6459 + }, + { + "epoch": 5.159744408945687, + "grad_norm": 0.19084404408931732, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6460 + }, + { + "epoch": 5.1605431309904155, + "grad_norm": 0.09255128353834152, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6461 + }, + { + "epoch": 5.161341853035144, + "grad_norm": 0.1443471759557724, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6462 + }, + { + "epoch": 5.162140575079873, + "grad_norm": 0.36597245931625366, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6463 + }, + { + "epoch": 5.1629392971246, + "grad_norm": 0.3835389316082001, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6464 + }, + { + "epoch": 5.163738019169329, + "grad_norm": 0.14208771288394928, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6465 + }, + { + "epoch": 5.164536741214057, + "grad_norm": 0.2520706355571747, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6466 + }, + { + "epoch": 5.165335463258786, + "grad_norm": 0.2595224976539612, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6467 + }, + { + "epoch": 5.166134185303514, + "grad_norm": 0.15721063315868378, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6468 + }, + { + "epoch": 5.166932907348243, + "grad_norm": 0.1772007793188095, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6469 + }, + { + "epoch": 5.167731629392971, + "grad_norm": 0.19899888336658478, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6470 + }, + { + "epoch": 5.1685303514377, + "grad_norm": 0.18689346313476562, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6471 + }, + { + "epoch": 5.169329073482428, + "grad_norm": 0.16748468577861786, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6472 + }, + { + "epoch": 5.170127795527157, + "grad_norm": 0.13296879827976227, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6473 + }, + { + "epoch": 5.170926517571885, + "grad_norm": 0.18742166459560394, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6474 + }, + { + "epoch": 5.171725239616613, + "grad_norm": 0.17811308801174164, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6475 + }, + { + "epoch": 5.172523961661342, + "grad_norm": 0.1360485702753067, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 6476 + }, + { + "epoch": 5.17332268370607, + "grad_norm": 0.13431121408939362, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6477 + }, + { + "epoch": 5.174121405750799, + "grad_norm": 0.12888069450855255, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6478 + }, + { + "epoch": 5.174920127795527, + "grad_norm": 0.15194712579250336, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6479 + }, + { + "epoch": 5.175718849840256, + "grad_norm": 0.13076889514923096, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6480 + }, + { + "epoch": 5.176517571884984, + "grad_norm": 0.14751110970973969, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6481 + }, + { + "epoch": 5.177316293929713, + "grad_norm": 0.11919333785772324, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6482 + }, + { + "epoch": 5.178115015974441, + "grad_norm": 0.12712688744068146, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6483 + }, + { + "epoch": 5.178913738019169, + "grad_norm": 0.13765369355678558, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 6484 + }, + { + "epoch": 5.1797124600638975, + "grad_norm": 0.11060373485088348, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6485 + }, + { + "epoch": 5.180511182108626, + "grad_norm": 0.056882213801145554, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6486 + }, + { + "epoch": 5.181309904153355, + "grad_norm": 0.11317770928144455, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6487 + }, + { + "epoch": 5.182108626198083, + "grad_norm": 0.09279809147119522, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6488 + }, + { + "epoch": 5.182907348242812, + "grad_norm": 0.09392786771059036, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6489 + }, + { + "epoch": 5.18370607028754, + "grad_norm": 0.13042815029621124, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6490 + }, + { + "epoch": 5.184504792332269, + "grad_norm": 0.07929978519678116, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6491 + }, + { + "epoch": 5.185303514376997, + "grad_norm": 0.12215851992368698, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6492 + }, + { + "epoch": 5.186102236421725, + "grad_norm": 0.12000773102045059, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6493 + }, + { + "epoch": 5.186900958466453, + "grad_norm": 0.08427707850933075, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6494 + }, + { + "epoch": 5.187699680511182, + "grad_norm": 0.158653125166893, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6495 + }, + { + "epoch": 5.18849840255591, + "grad_norm": 0.11087878793478012, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6496 + }, + { + "epoch": 5.189297124600639, + "grad_norm": 0.12649668753147125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6497 + }, + { + "epoch": 5.1900958466453675, + "grad_norm": 0.0821281224489212, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6498 + }, + { + "epoch": 5.190894568690096, + "grad_norm": 0.07192671298980713, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6499 + }, + { + "epoch": 5.1916932907348246, + "grad_norm": 0.10505214333534241, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6500 + }, + { + "epoch": 5.192492012779553, + "grad_norm": 0.11772353947162628, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6501 + }, + { + "epoch": 5.193290734824281, + "grad_norm": 0.15557901561260223, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6502 + }, + { + "epoch": 5.194089456869009, + "grad_norm": 0.09753020852804184, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6503 + }, + { + "epoch": 5.194888178913738, + "grad_norm": 0.10331830382347107, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6504 + }, + { + "epoch": 5.195686900958466, + "grad_norm": 0.130085289478302, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6505 + }, + { + "epoch": 5.196485623003195, + "grad_norm": 0.08772018551826477, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6506 + }, + { + "epoch": 5.197284345047923, + "grad_norm": 0.1906667798757553, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6507 + }, + { + "epoch": 5.198083067092652, + "grad_norm": 0.06724394112825394, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6508 + }, + { + "epoch": 5.19888178913738, + "grad_norm": 0.1141325905919075, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6509 + }, + { + "epoch": 5.199680511182109, + "grad_norm": 0.08354665338993073, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6510 + }, + { + "epoch": 5.2004792332268375, + "grad_norm": 0.1072440817952156, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6511 + }, + { + "epoch": 5.201277955271565, + "grad_norm": 0.10670839250087738, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6512 + }, + { + "epoch": 5.202076677316294, + "grad_norm": 0.10079781711101532, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6513 + }, + { + "epoch": 5.202875399361022, + "grad_norm": 0.1281125396490097, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6514 + }, + { + "epoch": 5.203674121405751, + "grad_norm": 0.1627720147371292, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6515 + }, + { + "epoch": 5.204472843450479, + "grad_norm": 0.1507575958967209, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6516 + }, + { + "epoch": 5.205271565495208, + "grad_norm": 0.17764779925346375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6517 + }, + { + "epoch": 5.206070287539936, + "grad_norm": 0.1825307011604309, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6518 + }, + { + "epoch": 5.206869009584665, + "grad_norm": 0.1151907742023468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6519 + }, + { + "epoch": 5.207667731629393, + "grad_norm": 0.1425708830356598, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6520 + }, + { + "epoch": 5.208466453674121, + "grad_norm": 0.08555550873279572, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6521 + }, + { + "epoch": 5.2092651757188495, + "grad_norm": 0.15400084853172302, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6522 + }, + { + "epoch": 5.210063897763578, + "grad_norm": 0.11088921129703522, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6523 + }, + { + "epoch": 5.210862619808307, + "grad_norm": 0.0959518551826477, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6524 + }, + { + "epoch": 5.211661341853035, + "grad_norm": 0.1054866686463356, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6525 + }, + { + "epoch": 5.212460063897764, + "grad_norm": 0.17849107086658478, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6526 + }, + { + "epoch": 5.213258785942492, + "grad_norm": 0.0910423994064331, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6527 + }, + { + "epoch": 5.214057507987221, + "grad_norm": 0.10857872664928436, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6528 + }, + { + "epoch": 5.214856230031949, + "grad_norm": 0.09012399613857269, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6529 + }, + { + "epoch": 5.215654952076678, + "grad_norm": 0.14724178612232208, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6530 + }, + { + "epoch": 5.216453674121405, + "grad_norm": 0.11357409507036209, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6531 + }, + { + "epoch": 5.217252396166134, + "grad_norm": 0.09721364825963974, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6532 + }, + { + "epoch": 5.218051118210862, + "grad_norm": 0.07837430387735367, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6533 + }, + { + "epoch": 5.218849840255591, + "grad_norm": 0.1181735098361969, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6534 + }, + { + "epoch": 5.2196485623003195, + "grad_norm": 0.07066017389297485, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6535 + }, + { + "epoch": 5.220447284345048, + "grad_norm": 0.06838417053222656, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6536 + }, + { + "epoch": 5.2212460063897765, + "grad_norm": 0.0919245257973671, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6537 + }, + { + "epoch": 5.222044728434505, + "grad_norm": 0.06859984248876572, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6538 + }, + { + "epoch": 5.222843450479234, + "grad_norm": 1.929213523864746, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6539 + }, + { + "epoch": 5.223642172523961, + "grad_norm": 0.11181562393903732, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6540 + }, + { + "epoch": 5.22444089456869, + "grad_norm": 0.09261998534202576, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6541 + }, + { + "epoch": 5.225239616613418, + "grad_norm": 0.11214403063058853, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6542 + }, + { + "epoch": 5.226038338658147, + "grad_norm": 0.1353820264339447, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6543 + }, + { + "epoch": 5.226837060702875, + "grad_norm": 0.11579953879117966, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 6544 + }, + { + "epoch": 5.227635782747604, + "grad_norm": 0.08284885436296463, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6545 + }, + { + "epoch": 5.228434504792332, + "grad_norm": 0.13805733621120453, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6546 + }, + { + "epoch": 5.229233226837061, + "grad_norm": 0.08924185484647751, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6547 + }, + { + "epoch": 5.2300319488817895, + "grad_norm": 0.10975285619497299, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6548 + }, + { + "epoch": 5.230830670926518, + "grad_norm": 0.10500271618366241, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6549 + }, + { + "epoch": 5.231629392971246, + "grad_norm": 0.09947814792394638, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 5.232428115015974, + "grad_norm": 0.10113594681024551, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6551 + }, + { + "epoch": 5.233226837060703, + "grad_norm": 0.12645265460014343, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6552 + }, + { + "epoch": 5.234025559105431, + "grad_norm": 0.06775741279125214, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6553 + }, + { + "epoch": 5.23482428115016, + "grad_norm": 0.09799529612064362, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6554 + }, + { + "epoch": 5.235623003194888, + "grad_norm": 0.13129538297653198, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6555 + }, + { + "epoch": 5.236421725239617, + "grad_norm": 0.10139735788106918, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6556 + }, + { + "epoch": 5.237220447284345, + "grad_norm": 0.13819058239459991, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6557 + }, + { + "epoch": 5.238019169329074, + "grad_norm": 0.09306512027978897, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6558 + }, + { + "epoch": 5.2388178913738015, + "grad_norm": 0.07963602244853973, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6559 + }, + { + "epoch": 5.23961661341853, + "grad_norm": 0.12864448130130768, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6560 + }, + { + "epoch": 5.2404153354632586, + "grad_norm": 0.1044403612613678, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6561 + }, + { + "epoch": 5.241214057507987, + "grad_norm": 0.07623843848705292, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6562 + }, + { + "epoch": 5.242012779552716, + "grad_norm": 0.10385097563266754, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6563 + }, + { + "epoch": 5.242811501597444, + "grad_norm": 0.07048188149929047, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6564 + }, + { + "epoch": 5.243610223642173, + "grad_norm": 0.25789955258369446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6565 + }, + { + "epoch": 5.244408945686901, + "grad_norm": 0.12271685153245926, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6566 + }, + { + "epoch": 5.24520766773163, + "grad_norm": 0.10512058436870575, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6567 + }, + { + "epoch": 5.246006389776358, + "grad_norm": 0.07663438469171524, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6568 + }, + { + "epoch": 5.246805111821086, + "grad_norm": 0.09937599301338196, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6569 + }, + { + "epoch": 5.247603833865814, + "grad_norm": 0.12242338061332703, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6570 + }, + { + "epoch": 5.248402555910543, + "grad_norm": 0.1733475625514984, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6571 + }, + { + "epoch": 5.2492012779552715, + "grad_norm": 0.1460944414138794, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6572 + }, + { + "epoch": 5.25, + "grad_norm": 0.09406521171331406, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6573 + }, + { + "epoch": 5.2507987220447285, + "grad_norm": 1.0146688222885132, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6574 + }, + { + "epoch": 5.251597444089457, + "grad_norm": 0.10557705909013748, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6575 + }, + { + "epoch": 5.252396166134186, + "grad_norm": 0.1306990385055542, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6576 + }, + { + "epoch": 5.253194888178914, + "grad_norm": 0.094961017370224, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6577 + }, + { + "epoch": 5.253993610223642, + "grad_norm": 0.13421863317489624, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6578 + }, + { + "epoch": 5.25479233226837, + "grad_norm": 0.12371776252985, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6579 + }, + { + "epoch": 5.255591054313099, + "grad_norm": 0.15863509476184845, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6580 + }, + { + "epoch": 5.256389776357827, + "grad_norm": 0.1156599149107933, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6581 + }, + { + "epoch": 5.257188498402556, + "grad_norm": 0.07102219015359879, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6582 + }, + { + "epoch": 5.257987220447284, + "grad_norm": 0.09030039608478546, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6583 + }, + { + "epoch": 5.258785942492013, + "grad_norm": 0.08848102390766144, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6584 + }, + { + "epoch": 5.2595846645367414, + "grad_norm": 0.07455430924892426, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6585 + }, + { + "epoch": 5.26038338658147, + "grad_norm": 0.07729559391736984, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6586 + }, + { + "epoch": 5.261182108626198, + "grad_norm": 0.0955357626080513, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6587 + }, + { + "epoch": 5.261980830670926, + "grad_norm": 0.08680911362171173, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6588 + }, + { + "epoch": 5.262779552715655, + "grad_norm": 0.1033414825797081, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6589 + }, + { + "epoch": 5.263578274760383, + "grad_norm": 0.09428979456424713, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6590 + }, + { + "epoch": 5.264376996805112, + "grad_norm": 0.07567942887544632, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6591 + }, + { + "epoch": 5.26517571884984, + "grad_norm": 0.221647247672081, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6592 + }, + { + "epoch": 5.265974440894569, + "grad_norm": 0.13839758932590485, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6593 + }, + { + "epoch": 5.266773162939297, + "grad_norm": 0.06060291454195976, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6594 + }, + { + "epoch": 5.267571884984026, + "grad_norm": 0.09146185964345932, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6595 + }, + { + "epoch": 5.268370607028754, + "grad_norm": 0.05557526275515556, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6596 + }, + { + "epoch": 5.269169329073483, + "grad_norm": 0.10190495103597641, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6597 + }, + { + "epoch": 5.2699680511182105, + "grad_norm": 0.07389659434556961, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6598 + }, + { + "epoch": 5.270766773162939, + "grad_norm": 0.11124115437269211, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6599 + }, + { + "epoch": 5.271565495207668, + "grad_norm": 0.10779515653848648, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6600 + }, + { + "epoch": 5.272364217252396, + "grad_norm": 0.09347773343324661, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6601 + }, + { + "epoch": 5.273162939297125, + "grad_norm": 0.15056683123111725, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6602 + }, + { + "epoch": 5.273961661341853, + "grad_norm": 0.1398572027683258, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6603 + }, + { + "epoch": 5.274760383386582, + "grad_norm": 0.08360682427883148, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6604 + }, + { + "epoch": 5.27555910543131, + "grad_norm": 0.10360747575759888, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6605 + }, + { + "epoch": 5.276357827476039, + "grad_norm": 0.0864897072315216, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6606 + }, + { + "epoch": 5.277156549520766, + "grad_norm": 0.11505412310361862, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6607 + }, + { + "epoch": 5.277955271565495, + "grad_norm": 0.10638110339641571, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6608 + }, + { + "epoch": 5.2787539936102235, + "grad_norm": 0.08349479734897614, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6609 + }, + { + "epoch": 5.279552715654952, + "grad_norm": 0.14465951919555664, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6610 + }, + { + "epoch": 5.2803514376996805, + "grad_norm": 0.08049577474594116, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6611 + }, + { + "epoch": 5.281150159744409, + "grad_norm": 0.10206092149019241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6612 + }, + { + "epoch": 5.281948881789138, + "grad_norm": 0.2721571922302246, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6613 + }, + { + "epoch": 5.282747603833866, + "grad_norm": 0.17503346502780914, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6614 + }, + { + "epoch": 5.283546325878595, + "grad_norm": 0.11459292471408844, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6615 + }, + { + "epoch": 5.284345047923322, + "grad_norm": 0.9974967241287231, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6616 + }, + { + "epoch": 5.285143769968051, + "grad_norm": 0.11502816528081894, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6617 + }, + { + "epoch": 5.285942492012779, + "grad_norm": 0.12992256879806519, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6618 + }, + { + "epoch": 5.286741214057508, + "grad_norm": 0.19872024655342102, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6619 + }, + { + "epoch": 5.287539936102236, + "grad_norm": 0.13013097643852234, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6620 + }, + { + "epoch": 5.288338658146965, + "grad_norm": 0.13644525408744812, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6621 + }, + { + "epoch": 5.289137380191693, + "grad_norm": 0.15101996064186096, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6622 + }, + { + "epoch": 5.289936102236422, + "grad_norm": 0.11075131595134735, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6623 + }, + { + "epoch": 5.2907348242811505, + "grad_norm": 0.0904511958360672, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6624 + }, + { + "epoch": 5.291533546325878, + "grad_norm": 0.08861460536718369, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6625 + }, + { + "epoch": 5.292332268370607, + "grad_norm": 0.10443824529647827, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6626 + }, + { + "epoch": 5.293130990415335, + "grad_norm": 0.07440674304962158, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6627 + }, + { + "epoch": 5.293929712460064, + "grad_norm": 0.21709975600242615, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6628 + }, + { + "epoch": 5.294728434504792, + "grad_norm": 0.1281055063009262, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6629 + }, + { + "epoch": 5.295527156549521, + "grad_norm": 0.10365202277898788, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6630 + }, + { + "epoch": 5.296325878594249, + "grad_norm": 1.004258632659912, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6631 + }, + { + "epoch": 5.297124600638978, + "grad_norm": 0.16660870611667633, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6632 + }, + { + "epoch": 5.297923322683706, + "grad_norm": 0.1146734207868576, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6633 + }, + { + "epoch": 5.298722044728435, + "grad_norm": 0.18288104236125946, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6634 + }, + { + "epoch": 5.2995207667731625, + "grad_norm": 0.11469347029924393, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6635 + }, + { + "epoch": 5.300319488817891, + "grad_norm": 0.1333407461643219, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6636 + }, + { + "epoch": 5.30111821086262, + "grad_norm": 0.15359243750572205, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6637 + }, + { + "epoch": 5.301916932907348, + "grad_norm": 0.0832027792930603, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6638 + }, + { + "epoch": 5.302715654952077, + "grad_norm": 0.10231718420982361, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6639 + }, + { + "epoch": 5.303514376996805, + "grad_norm": 0.11031626909971237, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6640 + }, + { + "epoch": 5.304313099041534, + "grad_norm": 0.08014792948961258, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6641 + }, + { + "epoch": 5.305111821086262, + "grad_norm": 0.10066475719213486, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6642 + }, + { + "epoch": 5.305910543130991, + "grad_norm": 0.12824396789073944, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6643 + }, + { + "epoch": 5.306709265175719, + "grad_norm": 0.09452345222234726, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6644 + }, + { + "epoch": 5.307507987220447, + "grad_norm": 0.09100557118654251, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6645 + }, + { + "epoch": 5.3083067092651754, + "grad_norm": 0.07995713502168655, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6646 + }, + { + "epoch": 5.309105431309904, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6647 + }, + { + "epoch": 5.3099041533546325, + "grad_norm": 0.09881234914064407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6648 + }, + { + "epoch": 5.310702875399361, + "grad_norm": 0.08131393790245056, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6649 + }, + { + "epoch": 5.31150159744409, + "grad_norm": 0.08842889964580536, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6650 + }, + { + "epoch": 5.312300319488818, + "grad_norm": 0.12630115449428558, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6651 + }, + { + "epoch": 5.313099041533547, + "grad_norm": 0.13429711759090424, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6652 + }, + { + "epoch": 5.313897763578275, + "grad_norm": 0.11347261816263199, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6653 + }, + { + "epoch": 5.314696485623003, + "grad_norm": 0.1555728167295456, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6654 + }, + { + "epoch": 5.315495207667731, + "grad_norm": 0.13184282183647156, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6655 + }, + { + "epoch": 5.31629392971246, + "grad_norm": 0.07821093499660492, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6656 + }, + { + "epoch": 5.317092651757188, + "grad_norm": 0.1300499588251114, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6657 + }, + { + "epoch": 5.317891373801917, + "grad_norm": 0.14896781742572784, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6658 + }, + { + "epoch": 5.318690095846645, + "grad_norm": 0.13370175659656525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6659 + }, + { + "epoch": 5.319488817891374, + "grad_norm": 0.14055652916431427, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6660 + }, + { + "epoch": 5.3202875399361025, + "grad_norm": 0.11674464493989944, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6661 + }, + { + "epoch": 5.321086261980831, + "grad_norm": 0.13155756890773773, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6662 + }, + { + "epoch": 5.321884984025559, + "grad_norm": 0.09616535156965256, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6663 + }, + { + "epoch": 5.322683706070287, + "grad_norm": 0.4228188991546631, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6664 + }, + { + "epoch": 5.323482428115016, + "grad_norm": 0.10942913591861725, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6665 + }, + { + "epoch": 5.324281150159744, + "grad_norm": 0.15592730045318604, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6666 + }, + { + "epoch": 5.325079872204473, + "grad_norm": 0.16837753355503082, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6667 + }, + { + "epoch": 5.325878594249201, + "grad_norm": 0.10512012243270874, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6668 + }, + { + "epoch": 5.32667731629393, + "grad_norm": 0.10834471136331558, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6669 + }, + { + "epoch": 5.327476038338658, + "grad_norm": 0.06588451564311981, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6670 + }, + { + "epoch": 5.328274760383387, + "grad_norm": 0.08714822679758072, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6671 + }, + { + "epoch": 5.329073482428115, + "grad_norm": 0.16129685938358307, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6672 + }, + { + "epoch": 5.329872204472843, + "grad_norm": 0.09294751286506653, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6673 + }, + { + "epoch": 5.330670926517572, + "grad_norm": 0.09905052185058594, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6674 + }, + { + "epoch": 5.3314696485623, + "grad_norm": 0.14584603905677795, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6675 + }, + { + "epoch": 5.332268370607029, + "grad_norm": 0.08384378254413605, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6676 + }, + { + "epoch": 5.333067092651757, + "grad_norm": 0.1672045886516571, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6677 + }, + { + "epoch": 5.333865814696486, + "grad_norm": 0.21656489372253418, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6678 + }, + { + "epoch": 5.334664536741214, + "grad_norm": 0.17034684121608734, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6679 + }, + { + "epoch": 5.335463258785943, + "grad_norm": 0.3153417408466339, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6680 + }, + { + "epoch": 5.336261980830671, + "grad_norm": 0.1953393816947937, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6681 + }, + { + "epoch": 5.3370607028754, + "grad_norm": 0.2085847705602646, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6682 + }, + { + "epoch": 5.337859424920127, + "grad_norm": 0.2679558992385864, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6683 + }, + { + "epoch": 5.338658146964856, + "grad_norm": 0.08705966919660568, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6684 + }, + { + "epoch": 5.3394568690095845, + "grad_norm": 0.09011410176753998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6685 + }, + { + "epoch": 5.340255591054313, + "grad_norm": 0.10358326137065887, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6686 + }, + { + "epoch": 5.3410543130990416, + "grad_norm": 0.08191518485546112, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6687 + }, + { + "epoch": 5.34185303514377, + "grad_norm": 0.0676165446639061, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6688 + }, + { + "epoch": 5.342651757188499, + "grad_norm": 0.18006695806980133, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6689 + }, + { + "epoch": 5.343450479233227, + "grad_norm": 0.11935598403215408, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.344249201277956, + "grad_norm": 0.14136075973510742, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6691 + }, + { + "epoch": 5.345047923322683, + "grad_norm": 0.19367988407611847, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6692 + }, + { + "epoch": 5.345846645367412, + "grad_norm": 0.1283622533082962, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6693 + }, + { + "epoch": 5.34664536741214, + "grad_norm": 0.11303326487541199, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6694 + }, + { + "epoch": 5.347444089456869, + "grad_norm": 0.09076731652021408, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6695 + }, + { + "epoch": 5.348242811501597, + "grad_norm": 0.12625159323215485, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6696 + }, + { + "epoch": 5.349041533546326, + "grad_norm": 0.18254370987415314, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6697 + }, + { + "epoch": 5.3498402555910545, + "grad_norm": 0.12221173942089081, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6698 + }, + { + "epoch": 5.350638977635783, + "grad_norm": 0.11586996912956238, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6699 + }, + { + "epoch": 5.3514376996805115, + "grad_norm": 0.1012619286775589, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6700 + }, + { + "epoch": 5.352236421725239, + "grad_norm": 0.10728003084659576, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6701 + }, + { + "epoch": 5.353035143769968, + "grad_norm": 0.08077894896268845, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6702 + }, + { + "epoch": 5.353833865814696, + "grad_norm": 0.10069102048873901, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6703 + }, + { + "epoch": 5.354632587859425, + "grad_norm": 0.11007717996835709, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6704 + }, + { + "epoch": 5.355431309904153, + "grad_norm": 0.08088147640228271, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6705 + }, + { + "epoch": 5.356230031948882, + "grad_norm": 0.06969337165355682, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6706 + }, + { + "epoch": 5.35702875399361, + "grad_norm": 0.09731647372245789, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6707 + }, + { + "epoch": 5.357827476038339, + "grad_norm": 0.07404995709657669, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6708 + }, + { + "epoch": 5.358626198083067, + "grad_norm": 0.09361755102872849, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6709 + }, + { + "epoch": 5.359424920127796, + "grad_norm": 0.11929210275411606, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6710 + }, + { + "epoch": 5.360223642172524, + "grad_norm": 0.11107892543077469, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6711 + }, + { + "epoch": 5.361022364217252, + "grad_norm": 0.10966535657644272, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6712 + }, + { + "epoch": 5.361821086261981, + "grad_norm": 0.11830565333366394, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6713 + }, + { + "epoch": 5.362619808306709, + "grad_norm": 0.15130563080310822, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6714 + }, + { + "epoch": 5.363418530351438, + "grad_norm": 0.12608309090137482, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6715 + }, + { + "epoch": 5.364217252396166, + "grad_norm": 0.10768693685531616, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6716 + }, + { + "epoch": 5.365015974440895, + "grad_norm": 0.10020256787538528, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6717 + }, + { + "epoch": 5.365814696485623, + "grad_norm": 0.11352406442165375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6718 + }, + { + "epoch": 5.366613418530352, + "grad_norm": 0.10058535635471344, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6719 + }, + { + "epoch": 5.36741214057508, + "grad_norm": 0.08427922427654266, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6720 + }, + { + "epoch": 5.368210862619808, + "grad_norm": 0.08600196242332458, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6721 + }, + { + "epoch": 5.3690095846645365, + "grad_norm": 0.0891844630241394, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 6722 + }, + { + "epoch": 5.369808306709265, + "grad_norm": 0.07231339812278748, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6723 + }, + { + "epoch": 5.3706070287539935, + "grad_norm": 0.0866503193974495, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6724 + }, + { + "epoch": 5.371405750798722, + "grad_norm": 0.44905656576156616, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6725 + }, + { + "epoch": 5.372204472843451, + "grad_norm": 0.2192242592573166, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6726 + }, + { + "epoch": 5.373003194888179, + "grad_norm": 0.15841859579086304, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6727 + }, + { + "epoch": 5.373801916932908, + "grad_norm": 0.1254468858242035, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6728 + }, + { + "epoch": 5.374600638977636, + "grad_norm": 1.5675911903381348, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6729 + }, + { + "epoch": 5.375399361022364, + "grad_norm": 0.20507164299488068, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6730 + }, + { + "epoch": 5.376198083067092, + "grad_norm": 0.26948630809783936, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6731 + }, + { + "epoch": 5.376996805111821, + "grad_norm": 0.15447315573692322, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6732 + }, + { + "epoch": 5.377795527156549, + "grad_norm": 0.17888243496418, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6733 + }, + { + "epoch": 5.378594249201278, + "grad_norm": 0.24683290719985962, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6734 + }, + { + "epoch": 5.3793929712460065, + "grad_norm": 0.15786881744861603, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6735 + }, + { + "epoch": 5.380191693290735, + "grad_norm": 0.18426702916622162, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6736 + }, + { + "epoch": 5.3809904153354635, + "grad_norm": 0.14444448053836823, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6737 + }, + { + "epoch": 5.381789137380192, + "grad_norm": 0.135011225938797, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6738 + }, + { + "epoch": 5.38258785942492, + "grad_norm": 0.19057826697826385, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6739 + }, + { + "epoch": 5.383386581469648, + "grad_norm": 0.12282486259937286, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6740 + }, + { + "epoch": 5.384185303514377, + "grad_norm": 0.17092294991016388, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6741 + }, + { + "epoch": 5.384984025559105, + "grad_norm": 0.19800473749637604, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6742 + }, + { + "epoch": 5.385782747603834, + "grad_norm": 0.07987766712903976, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6743 + }, + { + "epoch": 5.386581469648562, + "grad_norm": 0.18386386334896088, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6744 + }, + { + "epoch": 5.387380191693291, + "grad_norm": 0.16529197990894318, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6745 + }, + { + "epoch": 5.388178913738019, + "grad_norm": 0.09607496112585068, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6746 + }, + { + "epoch": 5.388977635782748, + "grad_norm": 0.15966713428497314, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6747 + }, + { + "epoch": 5.389776357827476, + "grad_norm": 0.1622796356678009, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6748 + }, + { + "epoch": 5.390575079872204, + "grad_norm": 0.09537432342767715, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6749 + }, + { + "epoch": 5.391373801916933, + "grad_norm": 0.1766965389251709, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6750 + }, + { + "epoch": 5.392172523961661, + "grad_norm": 0.21354711055755615, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6751 + }, + { + "epoch": 5.39297124600639, + "grad_norm": 0.093564473092556, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6752 + }, + { + "epoch": 5.393769968051118, + "grad_norm": 0.14756347239017487, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6753 + }, + { + "epoch": 5.394568690095847, + "grad_norm": 0.10537468641996384, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6754 + }, + { + "epoch": 5.395367412140575, + "grad_norm": 0.15626567602157593, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6755 + }, + { + "epoch": 5.396166134185304, + "grad_norm": 0.16282637417316437, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6756 + }, + { + "epoch": 5.396964856230032, + "grad_norm": 0.0745241791009903, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6757 + }, + { + "epoch": 5.397763578274761, + "grad_norm": 0.1221894845366478, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6758 + }, + { + "epoch": 5.3985623003194885, + "grad_norm": 0.08314131945371628, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6759 + }, + { + "epoch": 5.399361022364217, + "grad_norm": 0.12707264721393585, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6760 + }, + { + "epoch": 5.4001597444089455, + "grad_norm": 0.12036006152629852, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6761 + }, + { + "epoch": 5.400958466453674, + "grad_norm": 0.12769176065921783, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6762 + }, + { + "epoch": 5.401757188498403, + "grad_norm": 0.2201661318540573, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6763 + }, + { + "epoch": 5.402555910543131, + "grad_norm": 0.15013982355594635, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6764 + }, + { + "epoch": 5.40335463258786, + "grad_norm": 0.7714766263961792, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6765 + }, + { + "epoch": 5.404153354632588, + "grad_norm": 0.20359933376312256, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6766 + }, + { + "epoch": 5.404952076677317, + "grad_norm": 0.12684984505176544, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6767 + }, + { + "epoch": 5.405750798722044, + "grad_norm": 0.09804195165634155, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6768 + }, + { + "epoch": 5.406549520766773, + "grad_norm": 0.10416880995035172, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6769 + }, + { + "epoch": 5.407348242811501, + "grad_norm": 0.1509416699409485, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6770 + }, + { + "epoch": 5.40814696485623, + "grad_norm": 0.15458443760871887, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6771 + }, + { + "epoch": 5.4089456869009584, + "grad_norm": 0.08355830609798431, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6772 + }, + { + "epoch": 5.409744408945687, + "grad_norm": 0.1228979080915451, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6773 + }, + { + "epoch": 5.4105431309904155, + "grad_norm": 0.12139632552862167, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6774 + }, + { + "epoch": 5.411341853035144, + "grad_norm": 0.16298502683639526, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6775 + }, + { + "epoch": 5.412140575079873, + "grad_norm": 0.09110788255929947, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6776 + }, + { + "epoch": 5.4129392971246, + "grad_norm": 0.08584781736135483, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6777 + }, + { + "epoch": 5.413738019169329, + "grad_norm": 0.10148828476667404, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6778 + }, + { + "epoch": 5.414536741214057, + "grad_norm": 0.1046212688088417, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6779 + }, + { + "epoch": 5.415335463258786, + "grad_norm": 0.12530827522277832, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6780 + }, + { + "epoch": 5.416134185303514, + "grad_norm": 0.07337464392185211, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6781 + }, + { + "epoch": 5.416932907348243, + "grad_norm": 0.10839185118675232, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6782 + }, + { + "epoch": 5.417731629392971, + "grad_norm": 0.07784926891326904, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6783 + }, + { + "epoch": 5.4185303514377, + "grad_norm": 0.08692190796136856, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6784 + }, + { + "epoch": 5.419329073482428, + "grad_norm": 0.08721921592950821, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6785 + }, + { + "epoch": 5.420127795527157, + "grad_norm": 0.09581280499696732, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6786 + }, + { + "epoch": 5.420926517571885, + "grad_norm": 0.1156916618347168, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6787 + }, + { + "epoch": 5.421725239616613, + "grad_norm": 0.4520327150821686, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6788 + }, + { + "epoch": 5.422523961661342, + "grad_norm": 0.0948205217719078, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6789 + }, + { + "epoch": 5.42332268370607, + "grad_norm": 0.07208927720785141, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6790 + }, + { + "epoch": 5.424121405750799, + "grad_norm": 0.06830724328756332, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6791 + }, + { + "epoch": 5.424920127795527, + "grad_norm": 0.10488666594028473, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6792 + }, + { + "epoch": 5.425718849840256, + "grad_norm": 0.08509235084056854, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6793 + }, + { + "epoch": 5.426517571884984, + "grad_norm": 0.09133832901716232, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6794 + }, + { + "epoch": 5.427316293929713, + "grad_norm": 0.11715687066316605, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6795 + }, + { + "epoch": 5.428115015974441, + "grad_norm": 0.1196032389998436, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6796 + }, + { + "epoch": 5.428913738019169, + "grad_norm": 0.14141549170017242, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6797 + }, + { + "epoch": 5.4297124600638975, + "grad_norm": 0.12866206467151642, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6798 + }, + { + "epoch": 5.430511182108626, + "grad_norm": 0.10802716016769409, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6799 + }, + { + "epoch": 5.431309904153355, + "grad_norm": 0.10947239398956299, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6800 + }, + { + "epoch": 5.432108626198083, + "grad_norm": 0.08339721709489822, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6801 + }, + { + "epoch": 5.432907348242812, + "grad_norm": 0.12407296150922775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6802 + }, + { + "epoch": 5.43370607028754, + "grad_norm": 0.10537894070148468, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6803 + }, + { + "epoch": 5.434504792332269, + "grad_norm": 0.0920059084892273, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6804 + }, + { + "epoch": 5.435303514376997, + "grad_norm": 0.1502516269683838, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6805 + }, + { + "epoch": 5.436102236421725, + "grad_norm": 0.2798864245414734, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 6806 + }, + { + "epoch": 5.436900958466453, + "grad_norm": 0.11037585884332657, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6807 + }, + { + "epoch": 5.437699680511182, + "grad_norm": 0.12594881653785706, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6808 + }, + { + "epoch": 5.43849840255591, + "grad_norm": 0.09976109862327576, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6809 + }, + { + "epoch": 5.439297124600639, + "grad_norm": 0.3285512328147888, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6810 + }, + { + "epoch": 5.4400958466453675, + "grad_norm": 0.49450287222862244, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6811 + }, + { + "epoch": 5.440894568690096, + "grad_norm": 0.06817556917667389, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6812 + }, + { + "epoch": 5.4416932907348246, + "grad_norm": 0.14917057752609253, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6813 + }, + { + "epoch": 5.442492012779553, + "grad_norm": 0.10008134692907333, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6814 + }, + { + "epoch": 5.443290734824281, + "grad_norm": 0.07854767143726349, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6815 + }, + { + "epoch": 5.444089456869009, + "grad_norm": 0.2441248893737793, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6816 + }, + { + "epoch": 5.444888178913738, + "grad_norm": 0.1276157647371292, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6817 + }, + { + "epoch": 5.445686900958466, + "grad_norm": 0.11779431253671646, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6818 + }, + { + "epoch": 5.446485623003195, + "grad_norm": 0.11788108944892883, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6819 + }, + { + "epoch": 5.447284345047923, + "grad_norm": 0.06554995477199554, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6820 + }, + { + "epoch": 5.448083067092652, + "grad_norm": 0.07937108725309372, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6821 + }, + { + "epoch": 5.44888178913738, + "grad_norm": 0.08041426539421082, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6822 + }, + { + "epoch": 5.449680511182109, + "grad_norm": 0.12429161369800568, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6823 + }, + { + "epoch": 5.4504792332268375, + "grad_norm": 0.09993165731430054, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6824 + }, + { + "epoch": 5.451277955271565, + "grad_norm": 0.07077670097351074, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6825 + }, + { + "epoch": 5.452076677316294, + "grad_norm": 0.12163005024194717, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6826 + }, + { + "epoch": 5.452875399361022, + "grad_norm": 0.19080819189548492, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6827 + }, + { + "epoch": 5.453674121405751, + "grad_norm": 0.06450853496789932, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6828 + }, + { + "epoch": 5.454472843450479, + "grad_norm": 0.8893078565597534, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6829 + }, + { + "epoch": 5.455271565495208, + "grad_norm": 0.08225185424089432, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6830 + }, + { + "epoch": 5.456070287539936, + "grad_norm": 0.08631845563650131, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6831 + }, + { + "epoch": 5.456869009584665, + "grad_norm": 0.1858949214220047, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6832 + }, + { + "epoch": 5.457667731629393, + "grad_norm": 0.10997786372900009, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6833 + }, + { + "epoch": 5.458466453674122, + "grad_norm": 0.09691416472196579, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6834 + }, + { + "epoch": 5.4592651757188495, + "grad_norm": 0.12523561716079712, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6835 + }, + { + "epoch": 5.460063897763578, + "grad_norm": 0.10094364732503891, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6836 + }, + { + "epoch": 5.460862619808307, + "grad_norm": 0.06598310172557831, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6837 + }, + { + "epoch": 5.461661341853035, + "grad_norm": 0.10221479833126068, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6838 + }, + { + "epoch": 5.462460063897764, + "grad_norm": 0.6545975804328918, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6839 + }, + { + "epoch": 5.463258785942492, + "grad_norm": 0.12167128920555115, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6840 + }, + { + "epoch": 5.464057507987221, + "grad_norm": 0.10822924226522446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6841 + }, + { + "epoch": 5.464856230031949, + "grad_norm": 0.11905575543642044, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6842 + }, + { + "epoch": 5.465654952076678, + "grad_norm": 0.10276103764772415, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6843 + }, + { + "epoch": 5.466453674121405, + "grad_norm": 0.09087378531694412, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6844 + }, + { + "epoch": 5.467252396166134, + "grad_norm": 0.13117510080337524, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6845 + }, + { + "epoch": 5.468051118210862, + "grad_norm": 0.14824305474758148, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6846 + }, + { + "epoch": 5.468849840255591, + "grad_norm": 0.08553508669137955, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6847 + }, + { + "epoch": 5.4696485623003195, + "grad_norm": 0.12209141999483109, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6848 + }, + { + "epoch": 5.470447284345048, + "grad_norm": 0.1992058902978897, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6849 + }, + { + "epoch": 5.4712460063897765, + "grad_norm": 0.08518865704536438, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6850 + }, + { + "epoch": 5.472044728434505, + "grad_norm": 0.10496464371681213, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6851 + }, + { + "epoch": 5.472843450479234, + "grad_norm": 0.08789866417646408, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6852 + }, + { + "epoch": 5.473642172523961, + "grad_norm": 0.08592598885297775, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6853 + }, + { + "epoch": 5.47444089456869, + "grad_norm": 0.061165813356637955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6854 + }, + { + "epoch": 5.475239616613418, + "grad_norm": 0.06936467438936234, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6855 + }, + { + "epoch": 5.476038338658147, + "grad_norm": 0.20519734919071198, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6856 + }, + { + "epoch": 5.476837060702875, + "grad_norm": 0.087073415517807, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6857 + }, + { + "epoch": 5.477635782747604, + "grad_norm": 0.10153642296791077, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6858 + }, + { + "epoch": 5.478434504792332, + "grad_norm": 0.12416163831949234, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6859 + }, + { + "epoch": 5.479233226837061, + "grad_norm": 0.1047174334526062, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6860 + }, + { + "epoch": 5.4800319488817895, + "grad_norm": 0.13690868020057678, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6861 + }, + { + "epoch": 5.480830670926517, + "grad_norm": 0.15995970368385315, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6862 + }, + { + "epoch": 5.481629392971246, + "grad_norm": 0.08172900229692459, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6863 + }, + { + "epoch": 5.482428115015974, + "grad_norm": 0.10956761986017227, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6864 + }, + { + "epoch": 5.483226837060703, + "grad_norm": 0.12259931862354279, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6865 + }, + { + "epoch": 5.484025559105431, + "grad_norm": 0.08295698463916779, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6866 + }, + { + "epoch": 5.48482428115016, + "grad_norm": 0.10935505479574203, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6867 + }, + { + "epoch": 5.485623003194888, + "grad_norm": 0.12436006963253021, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6868 + }, + { + "epoch": 5.486421725239617, + "grad_norm": 0.08449307829141617, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6869 + }, + { + "epoch": 5.487220447284345, + "grad_norm": 0.10897113382816315, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6870 + }, + { + "epoch": 5.488019169329074, + "grad_norm": 0.06856910139322281, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6871 + }, + { + "epoch": 5.488817891373802, + "grad_norm": 0.07105988264083862, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6872 + }, + { + "epoch": 5.48961661341853, + "grad_norm": 0.08778723329305649, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6873 + }, + { + "epoch": 5.4904153354632586, + "grad_norm": 0.07818275690078735, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6874 + }, + { + "epoch": 5.491214057507987, + "grad_norm": 0.08410139381885529, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6875 + }, + { + "epoch": 5.492012779552716, + "grad_norm": 0.0804608166217804, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6876 + }, + { + "epoch": 5.492811501597444, + "grad_norm": 0.10089578479528427, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6877 + }, + { + "epoch": 5.493610223642173, + "grad_norm": 0.08231056481599808, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6878 + }, + { + "epoch": 5.494408945686901, + "grad_norm": 0.07642059773206711, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6879 + }, + { + "epoch": 5.49520766773163, + "grad_norm": 0.11312755942344666, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6880 + }, + { + "epoch": 5.496006389776358, + "grad_norm": 0.06288543343544006, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6881 + }, + { + "epoch": 5.496805111821086, + "grad_norm": 0.09648934751749039, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6882 + }, + { + "epoch": 5.497603833865814, + "grad_norm": 0.09374719858169556, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6883 + }, + { + "epoch": 5.498402555910543, + "grad_norm": 0.10596928000450134, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6884 + }, + { + "epoch": 5.4992012779552715, + "grad_norm": 0.06540077924728394, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6885 + }, + { + "epoch": 5.5, + "grad_norm": 0.05208199843764305, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6886 + }, + { + "epoch": 5.5007987220447285, + "grad_norm": 0.10762238502502441, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6887 + }, + { + "epoch": 5.501597444089457, + "grad_norm": 0.122553251683712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6888 + }, + { + "epoch": 5.502396166134186, + "grad_norm": 0.07663412392139435, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6889 + }, + { + "epoch": 5.503194888178914, + "grad_norm": 0.09100968390703201, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6890 + }, + { + "epoch": 5.503993610223642, + "grad_norm": 0.24931807816028595, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6891 + }, + { + "epoch": 5.50479233226837, + "grad_norm": 0.07812821120023727, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6892 + }, + { + "epoch": 5.505591054313099, + "grad_norm": 0.04760657623410225, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6893 + }, + { + "epoch": 5.506389776357827, + "grad_norm": 0.08183290809392929, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6894 + }, + { + "epoch": 5.507188498402556, + "grad_norm": 0.09541092067956924, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6895 + }, + { + "epoch": 5.507987220447284, + "grad_norm": 0.04168708249926567, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6896 + }, + { + "epoch": 5.508785942492013, + "grad_norm": 0.07038994133472443, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6897 + }, + { + "epoch": 5.5095846645367414, + "grad_norm": 0.060375142842531204, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6898 + }, + { + "epoch": 5.51038338658147, + "grad_norm": 0.048829223960638046, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6899 + }, + { + "epoch": 5.511182108626198, + "grad_norm": 0.057894766330718994, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6900 + }, + { + "epoch": 5.511980830670926, + "grad_norm": 0.05786101892590523, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6901 + }, + { + "epoch": 5.512779552715655, + "grad_norm": 0.07246953994035721, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6902 + }, + { + "epoch": 5.513578274760383, + "grad_norm": 0.07493462413549423, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6903 + }, + { + "epoch": 5.514376996805112, + "grad_norm": 0.060612600296735764, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6904 + }, + { + "epoch": 5.51517571884984, + "grad_norm": 0.0666302740573883, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6905 + }, + { + "epoch": 5.515974440894569, + "grad_norm": 0.08713024109601974, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6906 + }, + { + "epoch": 5.516773162939297, + "grad_norm": 0.31083860993385315, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6907 + }, + { + "epoch": 5.517571884984026, + "grad_norm": 0.0808933675289154, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6908 + }, + { + "epoch": 5.518370607028754, + "grad_norm": 0.1312016248703003, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6909 + }, + { + "epoch": 5.519169329073483, + "grad_norm": 0.20448890328407288, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6910 + }, + { + "epoch": 5.5199680511182105, + "grad_norm": 0.2519006133079529, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6911 + }, + { + "epoch": 5.520766773162939, + "grad_norm": 0.11359903216362, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6912 + }, + { + "epoch": 5.521565495207668, + "grad_norm": 0.07498760521411896, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6913 + }, + { + "epoch": 5.522364217252396, + "grad_norm": 0.06599561125040054, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6914 + }, + { + "epoch": 5.523162939297125, + "grad_norm": 0.08988697826862335, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6915 + }, + { + "epoch": 5.523961661341853, + "grad_norm": 0.06968241930007935, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6916 + }, + { + "epoch": 5.524760383386582, + "grad_norm": 0.07231415063142776, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6917 + }, + { + "epoch": 5.52555910543131, + "grad_norm": 0.07369428128004074, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6918 + }, + { + "epoch": 5.526357827476039, + "grad_norm": 0.07677069306373596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6919 + }, + { + "epoch": 5.527156549520766, + "grad_norm": 0.07391869276762009, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6920 + }, + { + "epoch": 5.527955271565495, + "grad_norm": 0.05270293354988098, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6921 + }, + { + "epoch": 5.5287539936102235, + "grad_norm": 0.10439106076955795, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6922 + }, + { + "epoch": 5.529552715654952, + "grad_norm": 0.06968904286623001, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6923 + }, + { + "epoch": 5.5303514376996805, + "grad_norm": 0.08401032537221909, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6924 + }, + { + "epoch": 5.531150159744409, + "grad_norm": 0.11993245035409927, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6925 + }, + { + "epoch": 5.531948881789138, + "grad_norm": 0.05857640504837036, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6926 + }, + { + "epoch": 5.532747603833866, + "grad_norm": 0.10513442009687424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6927 + }, + { + "epoch": 5.533546325878595, + "grad_norm": 0.12233056873083115, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6928 + }, + { + "epoch": 5.534345047923322, + "grad_norm": 0.06959997117519379, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6929 + }, + { + "epoch": 5.535143769968051, + "grad_norm": 0.08057182282209396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6930 + }, + { + "epoch": 5.535942492012779, + "grad_norm": 0.09816458821296692, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6931 + }, + { + "epoch": 5.536741214057508, + "grad_norm": 0.055738940834999084, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6932 + }, + { + "epoch": 5.537539936102236, + "grad_norm": 0.0939234122633934, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6933 + }, + { + "epoch": 5.538338658146965, + "grad_norm": 0.12143029272556305, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6934 + }, + { + "epoch": 5.539137380191693, + "grad_norm": 0.08409210294485092, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6935 + }, + { + "epoch": 5.539936102236422, + "grad_norm": 0.10690448433160782, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6936 + }, + { + "epoch": 5.5407348242811505, + "grad_norm": 0.20701836049556732, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6937 + }, + { + "epoch": 5.541533546325878, + "grad_norm": 0.09124163538217545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6938 + }, + { + "epoch": 5.542332268370607, + "grad_norm": 0.08295103162527084, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6939 + }, + { + "epoch": 5.543130990415335, + "grad_norm": 0.1179230809211731, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6940 + }, + { + "epoch": 5.543929712460064, + "grad_norm": 0.12345689535140991, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6941 + }, + { + "epoch": 5.544728434504792, + "grad_norm": 0.052616000175476074, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6942 + }, + { + "epoch": 5.545527156549521, + "grad_norm": 0.07918131351470947, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6943 + }, + { + "epoch": 5.546325878594249, + "grad_norm": 0.04847119748592377, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6944 + }, + { + "epoch": 5.547124600638978, + "grad_norm": 0.06204143166542053, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6945 + }, + { + "epoch": 5.547923322683706, + "grad_norm": 0.07778293639421463, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6946 + }, + { + "epoch": 5.548722044728435, + "grad_norm": 0.05037623643875122, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6947 + }, + { + "epoch": 5.549520766773163, + "grad_norm": 0.09024710208177567, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 6948 + }, + { + "epoch": 5.550319488817891, + "grad_norm": 0.0872211754322052, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6949 + }, + { + "epoch": 5.55111821086262, + "grad_norm": 0.08456625789403915, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6950 + }, + { + "epoch": 5.551916932907348, + "grad_norm": 0.054692018777132034, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6951 + }, + { + "epoch": 5.552715654952077, + "grad_norm": 0.10690787434577942, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6952 + }, + { + "epoch": 5.553514376996805, + "grad_norm": 0.07764400541782379, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6953 + }, + { + "epoch": 5.554313099041534, + "grad_norm": 0.08423051983118057, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6954 + }, + { + "epoch": 5.555111821086262, + "grad_norm": 0.06771727651357651, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6955 + }, + { + "epoch": 5.555910543130991, + "grad_norm": 0.10505887866020203, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6956 + }, + { + "epoch": 5.556709265175719, + "grad_norm": 0.054641906172037125, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6957 + }, + { + "epoch": 5.557507987220447, + "grad_norm": 0.05115118622779846, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6958 + }, + { + "epoch": 5.5583067092651754, + "grad_norm": 0.07177245616912842, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6959 + }, + { + "epoch": 5.559105431309904, + "grad_norm": 0.06642751395702362, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6960 + }, + { + "epoch": 5.5599041533546325, + "grad_norm": 0.08428867161273956, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6961 + }, + { + "epoch": 5.560702875399361, + "grad_norm": 0.044375378638505936, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6962 + }, + { + "epoch": 5.56150159744409, + "grad_norm": 0.06384986639022827, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6963 + }, + { + "epoch": 5.562300319488818, + "grad_norm": 0.052885912358760834, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6964 + }, + { + "epoch": 5.563099041533547, + "grad_norm": 0.05244029313325882, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6965 + }, + { + "epoch": 5.563897763578275, + "grad_norm": 0.1781054139137268, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6966 + }, + { + "epoch": 5.564696485623003, + "grad_norm": 0.8067191243171692, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6967 + }, + { + "epoch": 5.565495207667731, + "grad_norm": 0.0759076327085495, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6968 + }, + { + "epoch": 5.56629392971246, + "grad_norm": 0.0820186585187912, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6969 + }, + { + "epoch": 5.567092651757188, + "grad_norm": 2.901848316192627, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6970 + }, + { + "epoch": 5.567891373801917, + "grad_norm": 0.5663259625434875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6971 + }, + { + "epoch": 5.568690095846645, + "grad_norm": 0.34909728169441223, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6972 + }, + { + "epoch": 5.569488817891374, + "grad_norm": 0.3031843602657318, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6973 + }, + { + "epoch": 5.5702875399361025, + "grad_norm": 0.9258882403373718, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6974 + }, + { + "epoch": 5.571086261980831, + "grad_norm": 0.37162891030311584, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6975 + }, + { + "epoch": 5.571884984025559, + "grad_norm": 0.11269918829202652, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6976 + }, + { + "epoch": 5.572683706070287, + "grad_norm": 0.20953021943569183, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6977 + }, + { + "epoch": 5.573482428115016, + "grad_norm": 0.22324982285499573, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6978 + }, + { + "epoch": 5.574281150159744, + "grad_norm": 0.47017180919647217, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6979 + }, + { + "epoch": 5.575079872204473, + "grad_norm": 0.22266747057437897, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 6980 + }, + { + "epoch": 5.575878594249201, + "grad_norm": 0.1609373688697815, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6981 + }, + { + "epoch": 5.57667731629393, + "grad_norm": 0.17458784580230713, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6982 + }, + { + "epoch": 5.577476038338658, + "grad_norm": 0.17354144155979156, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6983 + }, + { + "epoch": 5.578274760383387, + "grad_norm": 0.10959888994693756, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6984 + }, + { + "epoch": 5.5790734824281145, + "grad_norm": 0.22630754113197327, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6985 + }, + { + "epoch": 5.579872204472844, + "grad_norm": 0.3786774277687073, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6986 + }, + { + "epoch": 5.580670926517572, + "grad_norm": 0.13818539679050446, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 6987 + }, + { + "epoch": 5.5814696485623, + "grad_norm": 0.22202269732952118, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6988 + }, + { + "epoch": 5.582268370607029, + "grad_norm": 0.08324426412582397, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6989 + }, + { + "epoch": 5.583067092651757, + "grad_norm": 0.16399513185024261, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6990 + }, + { + "epoch": 5.583865814696486, + "grad_norm": 0.13956478238105774, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6991 + }, + { + "epoch": 5.584664536741214, + "grad_norm": 0.09159751981496811, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6992 + }, + { + "epoch": 5.585463258785943, + "grad_norm": 0.19404387474060059, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6993 + }, + { + "epoch": 5.586261980830671, + "grad_norm": 0.07866083085536957, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6994 + }, + { + "epoch": 5.5870607028754, + "grad_norm": 0.10653684288263321, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6995 + }, + { + "epoch": 5.587859424920127, + "grad_norm": 0.12254250794649124, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6996 + }, + { + "epoch": 5.588658146964856, + "grad_norm": 0.0665711760520935, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6997 + }, + { + "epoch": 5.5894568690095845, + "grad_norm": 0.1234782338142395, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6998 + }, + { + "epoch": 5.590255591054313, + "grad_norm": 0.10345113277435303, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6999 + }, + { + "epoch": 5.5910543130990416, + "grad_norm": 0.10187766700983047, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7000 + }, + { + "epoch": 5.59185303514377, + "grad_norm": 0.10330864042043686, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7001 + }, + { + "epoch": 5.592651757188499, + "grad_norm": 0.12427254766225815, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7002 + }, + { + "epoch": 5.593450479233227, + "grad_norm": 0.06854265183210373, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7003 + }, + { + "epoch": 5.594249201277956, + "grad_norm": 0.07029487192630768, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7004 + }, + { + "epoch": 5.595047923322683, + "grad_norm": 0.07483061403036118, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7005 + }, + { + "epoch": 5.595846645367412, + "grad_norm": 0.08542168885469437, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7006 + }, + { + "epoch": 5.59664536741214, + "grad_norm": 0.05537399277091026, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7007 + }, + { + "epoch": 5.597444089456869, + "grad_norm": 0.28531956672668457, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7008 + }, + { + "epoch": 5.598242811501597, + "grad_norm": 0.1349600851535797, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7009 + }, + { + "epoch": 5.599041533546326, + "grad_norm": 0.06000711768865585, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7010 + }, + { + "epoch": 5.5998402555910545, + "grad_norm": 0.08139210939407349, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 7011 + }, + { + "epoch": 5.600638977635783, + "grad_norm": 0.08603602647781372, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7012 + }, + { + "epoch": 5.6014376996805115, + "grad_norm": 0.06586270034313202, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7013 + }, + { + "epoch": 5.602236421725239, + "grad_norm": 0.06276310235261917, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7014 + }, + { + "epoch": 5.603035143769968, + "grad_norm": 0.06072620674967766, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7015 + }, + { + "epoch": 5.603833865814696, + "grad_norm": 0.07509211450815201, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7016 + }, + { + "epoch": 5.604632587859425, + "grad_norm": 0.07241938263177872, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7017 + }, + { + "epoch": 5.605431309904153, + "grad_norm": 0.05110672488808632, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7018 + }, + { + "epoch": 5.606230031948882, + "grad_norm": 0.043005820363759995, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7019 + }, + { + "epoch": 5.60702875399361, + "grad_norm": 0.06298743188381195, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7020 + }, + { + "epoch": 5.607827476038339, + "grad_norm": 0.09457913786172867, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7021 + }, + { + "epoch": 5.608626198083067, + "grad_norm": 0.08066218346357346, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7022 + }, + { + "epoch": 5.609424920127795, + "grad_norm": 0.0845603421330452, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7023 + }, + { + "epoch": 5.6102236421725244, + "grad_norm": 0.09121926873922348, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7024 + }, + { + "epoch": 5.611022364217252, + "grad_norm": 0.12013491243124008, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7025 + }, + { + "epoch": 5.611821086261981, + "grad_norm": 0.062171660363674164, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7026 + }, + { + "epoch": 5.612619808306709, + "grad_norm": 0.05688954144716263, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7027 + }, + { + "epoch": 5.613418530351438, + "grad_norm": 0.049224793910980225, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7028 + }, + { + "epoch": 5.614217252396166, + "grad_norm": 0.06337599456310272, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7029 + }, + { + "epoch": 5.615015974440895, + "grad_norm": 0.03602084144949913, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7030 + }, + { + "epoch": 5.615814696485623, + "grad_norm": 0.06257645785808563, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7031 + }, + { + "epoch": 5.616613418530352, + "grad_norm": 0.09524381905794144, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7032 + }, + { + "epoch": 5.61741214057508, + "grad_norm": 0.06262468546628952, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7033 + }, + { + "epoch": 5.618210862619808, + "grad_norm": 0.23001722991466522, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7034 + }, + { + "epoch": 5.6190095846645365, + "grad_norm": 0.06312809139490128, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7035 + }, + { + "epoch": 5.619808306709265, + "grad_norm": 0.055973440408706665, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7036 + }, + { + "epoch": 5.6206070287539935, + "grad_norm": 0.0943455770611763, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7037 + }, + { + "epoch": 5.621405750798722, + "grad_norm": 0.05577901378273964, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7038 + }, + { + "epoch": 5.622204472843451, + "grad_norm": 0.057599395513534546, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7039 + }, + { + "epoch": 5.623003194888179, + "grad_norm": 0.07785748690366745, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7040 + }, + { + "epoch": 5.623801916932908, + "grad_norm": 0.04796557500958443, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7041 + }, + { + "epoch": 5.624600638977636, + "grad_norm": 0.19438667595386505, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7042 + }, + { + "epoch": 5.625399361022364, + "grad_norm": 0.10055433958768845, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7043 + }, + { + "epoch": 5.626198083067092, + "grad_norm": 0.06082126125693321, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7044 + }, + { + "epoch": 5.626996805111821, + "grad_norm": 0.07862866669893265, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7045 + }, + { + "epoch": 5.627795527156549, + "grad_norm": 0.09042234718799591, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7046 + }, + { + "epoch": 5.628594249201278, + "grad_norm": 0.06087128072977066, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7047 + }, + { + "epoch": 5.6293929712460065, + "grad_norm": 0.04091280326247215, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7048 + }, + { + "epoch": 5.630191693290735, + "grad_norm": 0.0625537633895874, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7049 + }, + { + "epoch": 5.6309904153354635, + "grad_norm": 0.04506808891892433, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7050 + }, + { + "epoch": 5.631789137380192, + "grad_norm": 0.0750357061624527, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7051 + }, + { + "epoch": 5.63258785942492, + "grad_norm": 0.06990372389554977, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7052 + }, + { + "epoch": 5.633386581469648, + "grad_norm": 0.05008876323699951, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7053 + }, + { + "epoch": 5.634185303514377, + "grad_norm": 0.07472547143697739, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7054 + }, + { + "epoch": 5.634984025559105, + "grad_norm": 0.04004117101430893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7055 + }, + { + "epoch": 5.635782747603834, + "grad_norm": 0.10103464871644974, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7056 + }, + { + "epoch": 5.636581469648562, + "grad_norm": 0.10850277543067932, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7057 + }, + { + "epoch": 5.637380191693291, + "grad_norm": 0.1109318807721138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7058 + }, + { + "epoch": 5.638178913738019, + "grad_norm": 0.06371457874774933, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7059 + }, + { + "epoch": 5.638977635782748, + "grad_norm": 0.1320749819278717, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7060 + }, + { + "epoch": 5.6397763578274756, + "grad_norm": 0.11957977712154388, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7061 + }, + { + "epoch": 5.640575079872205, + "grad_norm": 0.10327479988336563, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7062 + }, + { + "epoch": 5.641373801916933, + "grad_norm": 0.09731981158256531, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7063 + }, + { + "epoch": 5.642172523961661, + "grad_norm": 0.10276936739683151, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7064 + }, + { + "epoch": 5.64297124600639, + "grad_norm": 0.06973864883184433, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7065 + }, + { + "epoch": 5.643769968051118, + "grad_norm": 0.12020955234766006, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7066 + }, + { + "epoch": 5.644568690095847, + "grad_norm": 0.15950947999954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7067 + }, + { + "epoch": 5.645367412140575, + "grad_norm": 0.08034086227416992, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7068 + }, + { + "epoch": 5.646166134185304, + "grad_norm": 0.11269761621952057, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7069 + }, + { + "epoch": 5.646964856230032, + "grad_norm": 0.1569385826587677, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7070 + }, + { + "epoch": 5.647763578274761, + "grad_norm": 0.09290867298841476, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7071 + }, + { + "epoch": 5.6485623003194885, + "grad_norm": 0.0742817223072052, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7072 + }, + { + "epoch": 5.649361022364217, + "grad_norm": 0.3531377911567688, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7073 + }, + { + "epoch": 5.6501597444089455, + "grad_norm": 0.05365251749753952, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7074 + }, + { + "epoch": 5.650958466453674, + "grad_norm": 0.10185245424509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7075 + }, + { + "epoch": 5.651757188498403, + "grad_norm": 0.08978144079446793, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7076 + }, + { + "epoch": 5.652555910543131, + "grad_norm": 0.06563816964626312, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7077 + }, + { + "epoch": 5.65335463258786, + "grad_norm": 0.11167218536138535, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7078 + }, + { + "epoch": 5.654153354632588, + "grad_norm": 0.10078081488609314, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7079 + }, + { + "epoch": 5.654952076677317, + "grad_norm": 0.04581546410918236, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7080 + }, + { + "epoch": 5.655750798722044, + "grad_norm": 0.04128880053758621, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7081 + }, + { + "epoch": 5.656549520766773, + "grad_norm": 0.0887683555483818, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7082 + }, + { + "epoch": 5.657348242811501, + "grad_norm": 0.06673122197389603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7083 + }, + { + "epoch": 5.65814696485623, + "grad_norm": 0.12348195165395737, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7084 + }, + { + "epoch": 5.6589456869009584, + "grad_norm": 0.04828948527574539, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7085 + }, + { + "epoch": 5.659744408945687, + "grad_norm": 0.09094297885894775, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7086 + }, + { + "epoch": 5.6605431309904155, + "grad_norm": 0.05775933712720871, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7087 + }, + { + "epoch": 5.661341853035144, + "grad_norm": 0.06460239738225937, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7088 + }, + { + "epoch": 5.662140575079873, + "grad_norm": 0.07246532291173935, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7089 + }, + { + "epoch": 5.6629392971246, + "grad_norm": 0.05635413900017738, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7090 + }, + { + "epoch": 5.663738019169329, + "grad_norm": 0.05866781249642372, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7091 + }, + { + "epoch": 5.664536741214057, + "grad_norm": 0.11024738848209381, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7092 + }, + { + "epoch": 5.665335463258786, + "grad_norm": 2.880472421646118, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7093 + }, + { + "epoch": 5.666134185303514, + "grad_norm": 0.147624671459198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7094 + }, + { + "epoch": 5.666932907348243, + "grad_norm": 0.16042540967464447, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7095 + }, + { + "epoch": 5.667731629392971, + "grad_norm": 0.044081881642341614, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7096 + }, + { + "epoch": 5.6685303514377, + "grad_norm": 0.1580066829919815, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7097 + }, + { + "epoch": 5.669329073482428, + "grad_norm": 0.1348607987165451, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7098 + }, + { + "epoch": 5.670127795527156, + "grad_norm": 0.06525023281574249, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7099 + }, + { + "epoch": 5.6709265175718855, + "grad_norm": 0.12954704463481903, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7100 + }, + { + "epoch": 5.671725239616613, + "grad_norm": 0.09241525083780289, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7101 + }, + { + "epoch": 5.672523961661342, + "grad_norm": 0.05581163614988327, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7102 + }, + { + "epoch": 5.67332268370607, + "grad_norm": 0.0864885225892067, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7103 + }, + { + "epoch": 5.674121405750799, + "grad_norm": 0.0783633440732956, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7104 + }, + { + "epoch": 5.674920127795527, + "grad_norm": 2.419416666030884, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7105 + }, + { + "epoch": 5.675718849840256, + "grad_norm": 0.30067741870880127, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7106 + }, + { + "epoch": 5.676517571884984, + "grad_norm": 0.2876960337162018, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 7107 + }, + { + "epoch": 5.677316293929713, + "grad_norm": 0.13828304409980774, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7108 + }, + { + "epoch": 5.678115015974441, + "grad_norm": 0.12691721320152283, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7109 + }, + { + "epoch": 5.678913738019169, + "grad_norm": 0.18356311321258545, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 7110 + }, + { + "epoch": 5.6797124600638975, + "grad_norm": 0.13121426105499268, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7111 + }, + { + "epoch": 5.680511182108626, + "grad_norm": 0.13354304432868958, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7112 + }, + { + "epoch": 5.681309904153355, + "grad_norm": 0.10858450084924698, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7113 + }, + { + "epoch": 5.682108626198083, + "grad_norm": 0.12026678770780563, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 7114 + }, + { + "epoch": 5.682907348242812, + "grad_norm": 0.10297723114490509, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7115 + }, + { + "epoch": 5.68370607028754, + "grad_norm": 0.10481604188680649, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7116 + }, + { + "epoch": 5.684504792332269, + "grad_norm": 0.1389889419078827, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7117 + }, + { + "epoch": 5.685303514376997, + "grad_norm": 0.047913264483213425, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7118 + }, + { + "epoch": 5.686102236421725, + "grad_norm": 0.07504977285861969, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7119 + }, + { + "epoch": 5.686900958466453, + "grad_norm": 0.08858702331781387, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7120 + }, + { + "epoch": 5.687699680511182, + "grad_norm": 0.07746905088424683, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7121 + }, + { + "epoch": 5.68849840255591, + "grad_norm": 0.20370569825172424, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7122 + }, + { + "epoch": 5.689297124600639, + "grad_norm": 0.053284503519535065, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7123 + }, + { + "epoch": 5.6900958466453675, + "grad_norm": 0.08579347282648087, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7124 + }, + { + "epoch": 5.690894568690096, + "grad_norm": 0.11220933496952057, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7125 + }, + { + "epoch": 5.6916932907348246, + "grad_norm": 0.11851351708173752, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 7126 + }, + { + "epoch": 5.692492012779553, + "grad_norm": 0.0839112401008606, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7127 + }, + { + "epoch": 5.693290734824281, + "grad_norm": 0.07717803865671158, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7128 + }, + { + "epoch": 5.694089456869009, + "grad_norm": 0.10219333320856094, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7129 + }, + { + "epoch": 5.694888178913738, + "grad_norm": 0.06746016442775726, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7130 + }, + { + "epoch": 5.695686900958466, + "grad_norm": 0.09630785137414932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7131 + }, + { + "epoch": 5.696485623003195, + "grad_norm": 0.059845466166734695, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7132 + }, + { + "epoch": 5.697284345047923, + "grad_norm": 0.10587267577648163, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7133 + }, + { + "epoch": 5.698083067092652, + "grad_norm": 0.12221334874629974, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7134 + }, + { + "epoch": 5.69888178913738, + "grad_norm": 0.1638030856847763, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7135 + }, + { + "epoch": 5.699680511182109, + "grad_norm": 0.04686988145112991, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7136 + }, + { + "epoch": 5.700479233226837, + "grad_norm": 0.09120972454547882, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7137 + }, + { + "epoch": 5.701277955271565, + "grad_norm": 0.1081257089972496, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7138 + }, + { + "epoch": 5.702076677316294, + "grad_norm": 0.07313218712806702, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7139 + }, + { + "epoch": 5.702875399361022, + "grad_norm": 0.06039511039853096, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7140 + }, + { + "epoch": 5.703674121405751, + "grad_norm": 0.14473693072795868, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7141 + }, + { + "epoch": 5.704472843450479, + "grad_norm": 0.15062592923641205, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7142 + }, + { + "epoch": 5.705271565495208, + "grad_norm": 0.09711029380559921, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7143 + }, + { + "epoch": 5.706070287539936, + "grad_norm": 0.056874651461839676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7144 + }, + { + "epoch": 5.706869009584665, + "grad_norm": 0.1077205091714859, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7145 + }, + { + "epoch": 5.707667731629393, + "grad_norm": 0.1437366008758545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7146 + }, + { + "epoch": 5.708466453674122, + "grad_norm": 0.06206873059272766, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7147 + }, + { + "epoch": 5.7092651757188495, + "grad_norm": 0.06379563361406326, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7148 + }, + { + "epoch": 5.710063897763578, + "grad_norm": 0.11586727946996689, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7149 + }, + { + "epoch": 5.710862619808307, + "grad_norm": 0.12792269885540009, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7150 + }, + { + "epoch": 5.711661341853035, + "grad_norm": 0.08514344692230225, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7151 + }, + { + "epoch": 5.712460063897764, + "grad_norm": 0.045359376817941666, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7152 + }, + { + "epoch": 5.713258785942492, + "grad_norm": 0.13782942295074463, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7153 + }, + { + "epoch": 5.714057507987221, + "grad_norm": 0.1362733691930771, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7154 + }, + { + "epoch": 5.714856230031949, + "grad_norm": 0.11249929666519165, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7155 + }, + { + "epoch": 5.715654952076678, + "grad_norm": 0.07308060675859451, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7156 + }, + { + "epoch": 5.716453674121405, + "grad_norm": 0.08434231579303741, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7157 + }, + { + "epoch": 5.717252396166134, + "grad_norm": 0.0800870731472969, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7158 + }, + { + "epoch": 5.718051118210862, + "grad_norm": 0.09833595156669617, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7159 + }, + { + "epoch": 5.718849840255591, + "grad_norm": 0.06979871541261673, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7160 + }, + { + "epoch": 5.7196485623003195, + "grad_norm": 0.3326590657234192, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7161 + }, + { + "epoch": 5.720447284345048, + "grad_norm": 0.07953538745641708, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7162 + }, + { + "epoch": 5.7212460063897765, + "grad_norm": 0.06084589287638664, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7163 + }, + { + "epoch": 5.722044728434505, + "grad_norm": 0.05060078203678131, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7164 + }, + { + "epoch": 5.722843450479234, + "grad_norm": 0.11765584349632263, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7165 + }, + { + "epoch": 5.723642172523961, + "grad_norm": 0.11147762089967728, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7166 + }, + { + "epoch": 5.72444089456869, + "grad_norm": 0.051353756338357925, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7167 + }, + { + "epoch": 5.725239616613418, + "grad_norm": 0.06255709379911423, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7168 + }, + { + "epoch": 5.726038338658147, + "grad_norm": 0.048915427178144455, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7169 + }, + { + "epoch": 5.726837060702875, + "grad_norm": 0.057233601808547974, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7170 + }, + { + "epoch": 5.727635782747604, + "grad_norm": 0.0828251764178276, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7171 + }, + { + "epoch": 5.728434504792332, + "grad_norm": 0.07387874275445938, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7172 + }, + { + "epoch": 5.729233226837061, + "grad_norm": 0.04857983812689781, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7173 + }, + { + "epoch": 5.7300319488817895, + "grad_norm": 0.07202452421188354, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7174 + }, + { + "epoch": 5.730830670926517, + "grad_norm": 0.4291386306285858, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7175 + }, + { + "epoch": 5.731629392971246, + "grad_norm": 0.07219598442316055, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7176 + }, + { + "epoch": 5.732428115015974, + "grad_norm": 0.07889580726623535, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7177 + }, + { + "epoch": 5.733226837060703, + "grad_norm": 0.1154242753982544, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7178 + }, + { + "epoch": 5.734025559105431, + "grad_norm": 0.1711360067129135, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7179 + }, + { + "epoch": 5.73482428115016, + "grad_norm": 0.15897679328918457, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7180 + }, + { + "epoch": 5.735623003194888, + "grad_norm": 0.056718453764915466, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7181 + }, + { + "epoch": 5.736421725239617, + "grad_norm": 0.10130516439676285, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7182 + }, + { + "epoch": 5.737220447284345, + "grad_norm": 0.10965991020202637, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7183 + }, + { + "epoch": 5.738019169329074, + "grad_norm": 0.043925706297159195, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7184 + }, + { + "epoch": 5.738817891373802, + "grad_norm": 0.16040641069412231, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7185 + }, + { + "epoch": 5.73961661341853, + "grad_norm": 0.545796275138855, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7186 + }, + { + "epoch": 5.7404153354632586, + "grad_norm": 0.12285015732049942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7187 + }, + { + "epoch": 5.741214057507987, + "grad_norm": 0.1241980791091919, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7188 + }, + { + "epoch": 5.742012779552716, + "grad_norm": 0.18415005505084991, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7189 + }, + { + "epoch": 5.742811501597444, + "grad_norm": 0.1455639749765396, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7190 + }, + { + "epoch": 5.743610223642173, + "grad_norm": 0.05731341987848282, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7191 + }, + { + "epoch": 5.744408945686901, + "grad_norm": 0.10810694098472595, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7192 + }, + { + "epoch": 5.74520766773163, + "grad_norm": 0.13279423117637634, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7193 + }, + { + "epoch": 5.746006389776358, + "grad_norm": 0.048075832426548004, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7194 + }, + { + "epoch": 5.746805111821086, + "grad_norm": 0.07276510447263718, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7195 + }, + { + "epoch": 5.747603833865814, + "grad_norm": 0.0666821077466011, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7196 + }, + { + "epoch": 5.748402555910543, + "grad_norm": 0.0950300320982933, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7197 + }, + { + "epoch": 5.7492012779552715, + "grad_norm": 0.07229208946228027, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7198 + }, + { + "epoch": 5.75, + "grad_norm": 0.08129260689020157, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7199 + }, + { + "epoch": 5.7507987220447285, + "grad_norm": 0.08685708791017532, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7200 + }, + { + "epoch": 5.751597444089457, + "grad_norm": 0.048116523772478104, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7201 + }, + { + "epoch": 5.752396166134186, + "grad_norm": 0.08470416814088821, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7202 + }, + { + "epoch": 5.753194888178914, + "grad_norm": 0.09388689696788788, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7203 + }, + { + "epoch": 5.753993610223642, + "grad_norm": 0.07961093634366989, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7204 + }, + { + "epoch": 5.75479233226837, + "grad_norm": 0.05949364975094795, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7205 + }, + { + "epoch": 5.755591054313099, + "grad_norm": 0.10149726271629333, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7206 + }, + { + "epoch": 5.756389776357827, + "grad_norm": 0.30414992570877075, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7207 + }, + { + "epoch": 5.757188498402556, + "grad_norm": 0.06670042872428894, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7208 + }, + { + "epoch": 5.757987220447284, + "grad_norm": 0.061501920223236084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7209 + }, + { + "epoch": 5.758785942492013, + "grad_norm": 0.06627584993839264, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7210 + }, + { + "epoch": 5.7595846645367414, + "grad_norm": 0.1268157660961151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7211 + }, + { + "epoch": 5.76038338658147, + "grad_norm": 0.10253716260194778, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7212 + }, + { + "epoch": 5.761182108626198, + "grad_norm": 0.08384321630001068, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7213 + }, + { + "epoch": 5.761980830670926, + "grad_norm": 0.09078267216682434, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7214 + }, + { + "epoch": 5.762779552715655, + "grad_norm": 0.10487394034862518, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7215 + }, + { + "epoch": 5.763578274760383, + "grad_norm": 0.12192805856466293, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7216 + }, + { + "epoch": 5.764376996805112, + "grad_norm": 0.16597039997577667, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7217 + }, + { + "epoch": 5.76517571884984, + "grad_norm": 0.08498643338680267, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7218 + }, + { + "epoch": 5.765974440894569, + "grad_norm": 0.12794862687587738, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7219 + }, + { + "epoch": 5.766773162939297, + "grad_norm": 0.13595858216285706, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7220 + }, + { + "epoch": 5.767571884984026, + "grad_norm": 0.08182058483362198, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7221 + }, + { + "epoch": 5.768370607028754, + "grad_norm": 0.11747279763221741, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7222 + }, + { + "epoch": 5.769169329073483, + "grad_norm": 0.13400238752365112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7223 + }, + { + "epoch": 5.7699680511182105, + "grad_norm": 0.18527893722057343, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7224 + }, + { + "epoch": 5.770766773162939, + "grad_norm": 0.05130131170153618, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7225 + }, + { + "epoch": 5.771565495207668, + "grad_norm": 0.14139772951602936, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7226 + }, + { + "epoch": 5.772364217252396, + "grad_norm": 0.07901434600353241, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7227 + }, + { + "epoch": 5.773162939297125, + "grad_norm": 0.0642717182636261, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7228 + }, + { + "epoch": 5.773961661341853, + "grad_norm": 0.0693419873714447, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7229 + }, + { + "epoch": 5.774760383386582, + "grad_norm": 0.06490292400121689, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7230 + }, + { + "epoch": 5.77555910543131, + "grad_norm": 0.09405414760112762, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7231 + }, + { + "epoch": 5.776357827476039, + "grad_norm": 0.10439605265855789, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7232 + }, + { + "epoch": 5.777156549520766, + "grad_norm": 0.06811316311359406, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7233 + }, + { + "epoch": 5.777955271565495, + "grad_norm": 0.0707770362496376, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7234 + }, + { + "epoch": 5.7787539936102235, + "grad_norm": 0.08751409500837326, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7235 + }, + { + "epoch": 5.779552715654952, + "grad_norm": 0.09626015275716782, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7236 + }, + { + "epoch": 5.7803514376996805, + "grad_norm": 0.11487453430891037, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7237 + }, + { + "epoch": 5.781150159744409, + "grad_norm": 0.06278856843709946, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7238 + }, + { + "epoch": 5.781948881789138, + "grad_norm": 0.131802499294281, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7239 + }, + { + "epoch": 5.782747603833866, + "grad_norm": 0.09209976345300674, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7240 + }, + { + "epoch": 5.783546325878595, + "grad_norm": 0.06524617224931717, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7241 + }, + { + "epoch": 5.784345047923322, + "grad_norm": 0.10735169053077698, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7242 + }, + { + "epoch": 5.785143769968051, + "grad_norm": 0.08926022797822952, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7243 + }, + { + "epoch": 5.785942492012779, + "grad_norm": 0.08254969120025635, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7244 + }, + { + "epoch": 5.786741214057508, + "grad_norm": 0.07478158175945282, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7245 + }, + { + "epoch": 5.787539936102236, + "grad_norm": 0.0974164679646492, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7246 + }, + { + "epoch": 5.788338658146965, + "grad_norm": 0.05145352706313133, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7247 + }, + { + "epoch": 5.789137380191693, + "grad_norm": 0.11986715346574783, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7248 + }, + { + "epoch": 5.789936102236422, + "grad_norm": 0.12020506709814072, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7249 + }, + { + "epoch": 5.7907348242811505, + "grad_norm": 0.07199704647064209, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7250 + }, + { + "epoch": 5.791533546325878, + "grad_norm": 0.10702182352542877, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7251 + }, + { + "epoch": 5.792332268370607, + "grad_norm": 0.10817115753889084, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7252 + }, + { + "epoch": 5.793130990415335, + "grad_norm": 0.1875494122505188, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7253 + }, + { + "epoch": 5.793929712460064, + "grad_norm": 0.07347052544355392, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7254 + }, + { + "epoch": 5.794728434504792, + "grad_norm": 0.08588847517967224, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7255 + }, + { + "epoch": 5.795527156549521, + "grad_norm": 0.08241020143032074, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7256 + }, + { + "epoch": 5.796325878594249, + "grad_norm": 0.06322775781154633, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7257 + }, + { + "epoch": 5.797124600638978, + "grad_norm": 0.10279159247875214, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7258 + }, + { + "epoch": 5.797923322683706, + "grad_norm": 0.1887427717447281, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7259 + }, + { + "epoch": 5.798722044728435, + "grad_norm": 0.12288179248571396, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7260 + }, + { + "epoch": 5.799520766773163, + "grad_norm": 0.07014663517475128, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7261 + }, + { + "epoch": 5.800319488817891, + "grad_norm": 0.3741980493068695, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7262 + }, + { + "epoch": 5.80111821086262, + "grad_norm": 0.10083315521478653, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7263 + }, + { + "epoch": 5.801916932907348, + "grad_norm": 0.06427261233329773, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7264 + }, + { + "epoch": 5.802715654952077, + "grad_norm": 0.06265366077423096, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7265 + }, + { + "epoch": 5.803514376996805, + "grad_norm": 0.09602728486061096, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7266 + }, + { + "epoch": 5.804313099041534, + "grad_norm": 0.10369620472192764, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7267 + }, + { + "epoch": 5.805111821086262, + "grad_norm": 0.09742012619972229, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7268 + }, + { + "epoch": 5.805910543130991, + "grad_norm": 0.11579136550426483, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7269 + }, + { + "epoch": 5.806709265175719, + "grad_norm": 0.11265771090984344, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7270 + }, + { + "epoch": 5.807507987220447, + "grad_norm": 0.10684274882078171, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7271 + }, + { + "epoch": 5.8083067092651754, + "grad_norm": 0.12550850212574005, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7272 + }, + { + "epoch": 5.809105431309904, + "grad_norm": 0.04966668784618378, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7273 + }, + { + "epoch": 5.8099041533546325, + "grad_norm": 0.26124852895736694, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7274 + }, + { + "epoch": 5.810702875399361, + "grad_norm": 0.12293774634599686, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7275 + }, + { + "epoch": 5.81150159744409, + "grad_norm": 0.11183387041091919, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7276 + }, + { + "epoch": 5.812300319488818, + "grad_norm": 0.08738099783658981, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7277 + }, + { + "epoch": 5.813099041533547, + "grad_norm": 0.06429604440927505, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7278 + }, + { + "epoch": 5.813897763578275, + "grad_norm": 0.09102299064397812, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7279 + }, + { + "epoch": 5.814696485623003, + "grad_norm": 0.06249788776040077, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7280 + }, + { + "epoch": 5.815495207667731, + "grad_norm": 0.08752568066120148, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7281 + }, + { + "epoch": 5.81629392971246, + "grad_norm": 0.06289692968130112, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7282 + }, + { + "epoch": 5.817092651757188, + "grad_norm": 0.1269187480211258, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7283 + }, + { + "epoch": 5.817891373801917, + "grad_norm": 0.0839361846446991, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7284 + }, + { + "epoch": 5.818690095846645, + "grad_norm": 0.0855027437210083, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7285 + }, + { + "epoch": 5.819488817891374, + "grad_norm": 0.20559446513652802, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7286 + }, + { + "epoch": 5.8202875399361025, + "grad_norm": 0.0740990862250328, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7287 + }, + { + "epoch": 5.821086261980831, + "grad_norm": 0.06762924790382385, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7288 + }, + { + "epoch": 5.821884984025559, + "grad_norm": 0.5238296985626221, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7289 + }, + { + "epoch": 5.822683706070287, + "grad_norm": 0.09929470717906952, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7290 + }, + { + "epoch": 5.823482428115016, + "grad_norm": 0.11528550088405609, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7291 + }, + { + "epoch": 5.824281150159744, + "grad_norm": 0.10563576966524124, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 7292 + }, + { + "epoch": 5.825079872204473, + "grad_norm": 0.13924843072891235, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7293 + }, + { + "epoch": 5.825878594249201, + "grad_norm": 0.1332271546125412, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7294 + }, + { + "epoch": 5.82667731629393, + "grad_norm": 0.15709803998470306, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7295 + }, + { + "epoch": 5.827476038338658, + "grad_norm": 0.19638708233833313, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7296 + }, + { + "epoch": 5.828274760383387, + "grad_norm": 0.16845624148845673, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7297 + }, + { + "epoch": 5.8290734824281145, + "grad_norm": 0.15753695368766785, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7298 + }, + { + "epoch": 5.829872204472844, + "grad_norm": 0.04734346270561218, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7299 + }, + { + "epoch": 5.830670926517572, + "grad_norm": 0.48153460025787354, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7300 + }, + { + "epoch": 5.8314696485623, + "grad_norm": 0.09118880331516266, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7301 + }, + { + "epoch": 5.832268370607029, + "grad_norm": 0.10301438719034195, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7302 + }, + { + "epoch": 5.833067092651757, + "grad_norm": 0.12838974595069885, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7303 + }, + { + "epoch": 5.833865814696486, + "grad_norm": 0.1537700593471527, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7304 + }, + { + "epoch": 5.834664536741214, + "grad_norm": 0.08763979375362396, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7305 + }, + { + "epoch": 5.835463258785943, + "grad_norm": 0.2613058388233185, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7306 + }, + { + "epoch": 5.836261980830671, + "grad_norm": 0.13767825067043304, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7307 + }, + { + "epoch": 5.8370607028754, + "grad_norm": 0.14907905459403992, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7308 + }, + { + "epoch": 5.837859424920127, + "grad_norm": 0.3314233124256134, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 7309 + }, + { + "epoch": 5.838658146964856, + "grad_norm": 0.1368636041879654, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7310 + }, + { + "epoch": 5.8394568690095845, + "grad_norm": 0.13423767685890198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7311 + }, + { + "epoch": 5.840255591054313, + "grad_norm": 0.08914478868246078, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7312 + }, + { + "epoch": 5.8410543130990416, + "grad_norm": 0.09363356977701187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7313 + }, + { + "epoch": 5.84185303514377, + "grad_norm": 0.226780965924263, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7314 + }, + { + "epoch": 5.842651757188499, + "grad_norm": 0.09002092480659485, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7315 + }, + { + "epoch": 5.843450479233227, + "grad_norm": 0.06387127935886383, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7316 + }, + { + "epoch": 5.844249201277956, + "grad_norm": 0.1643945276737213, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7317 + }, + { + "epoch": 5.845047923322683, + "grad_norm": 0.13561291992664337, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7318 + }, + { + "epoch": 5.845846645367412, + "grad_norm": 0.14334949851036072, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7319 + }, + { + "epoch": 5.84664536741214, + "grad_norm": 0.13982698321342468, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7320 + }, + { + "epoch": 5.847444089456869, + "grad_norm": 0.10822772979736328, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7321 + }, + { + "epoch": 5.848242811501597, + "grad_norm": 0.07073087245225906, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7322 + }, + { + "epoch": 5.849041533546326, + "grad_norm": 0.09560684859752655, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7323 + }, + { + "epoch": 5.8498402555910545, + "grad_norm": 0.0882779061794281, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7324 + }, + { + "epoch": 5.850638977635783, + "grad_norm": 0.17319771647453308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7325 + }, + { + "epoch": 5.8514376996805115, + "grad_norm": 0.12140306830406189, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7326 + }, + { + "epoch": 5.852236421725239, + "grad_norm": 0.12064560502767563, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7327 + }, + { + "epoch": 5.853035143769968, + "grad_norm": 0.0733642578125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7328 + }, + { + "epoch": 5.853833865814696, + "grad_norm": 0.08563291281461716, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7329 + }, + { + "epoch": 5.854632587859425, + "grad_norm": 0.11337493360042572, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7330 + }, + { + "epoch": 5.855431309904153, + "grad_norm": 0.12164553254842758, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7331 + }, + { + "epoch": 5.856230031948882, + "grad_norm": 0.06406484544277191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7332 + }, + { + "epoch": 5.85702875399361, + "grad_norm": 0.0765780508518219, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7333 + }, + { + "epoch": 5.857827476038339, + "grad_norm": 0.12847815454006195, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7334 + }, + { + "epoch": 5.858626198083067, + "grad_norm": 0.11934550106525421, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7335 + }, + { + "epoch": 5.859424920127795, + "grad_norm": 0.08170188963413239, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7336 + }, + { + "epoch": 5.8602236421725244, + "grad_norm": 0.13636507093906403, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7337 + }, + { + "epoch": 5.861022364217252, + "grad_norm": 0.11030741780996323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7338 + }, + { + "epoch": 5.861821086261981, + "grad_norm": 0.10200777649879456, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7339 + }, + { + "epoch": 5.862619808306709, + "grad_norm": 0.09916897118091583, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7340 + }, + { + "epoch": 5.863418530351438, + "grad_norm": 0.08136509358882904, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7341 + }, + { + "epoch": 5.864217252396166, + "grad_norm": 0.051609545946121216, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7342 + }, + { + "epoch": 5.865015974440895, + "grad_norm": 0.061890844255685806, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7343 + }, + { + "epoch": 5.865814696485623, + "grad_norm": 0.10308966040611267, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7344 + }, + { + "epoch": 5.866613418530352, + "grad_norm": 0.06762709468603134, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7345 + }, + { + "epoch": 5.86741214057508, + "grad_norm": 0.07767036557197571, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7346 + }, + { + "epoch": 5.868210862619808, + "grad_norm": 0.10608458518981934, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7347 + }, + { + "epoch": 5.8690095846645365, + "grad_norm": 0.13812315464019775, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7348 + }, + { + "epoch": 5.869808306709265, + "grad_norm": 0.10485442727804184, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7349 + }, + { + "epoch": 5.8706070287539935, + "grad_norm": 0.08510198444128036, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7350 + }, + { + "epoch": 5.871405750798722, + "grad_norm": 0.17235122621059418, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7351 + }, + { + "epoch": 5.872204472843451, + "grad_norm": 0.057075515389442444, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7352 + }, + { + "epoch": 5.873003194888179, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7353 + }, + { + "epoch": 5.873801916932908, + "grad_norm": 0.1859748661518097, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7354 + }, + { + "epoch": 5.874600638977636, + "grad_norm": 0.2350156307220459, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7355 + }, + { + "epoch": 5.875399361022364, + "grad_norm": 0.11264859884977341, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 7356 + }, + { + "epoch": 5.876198083067092, + "grad_norm": 0.2859210968017578, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7357 + }, + { + "epoch": 5.876996805111821, + "grad_norm": 0.08706829696893692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7358 + }, + { + "epoch": 5.877795527156549, + "grad_norm": 0.0644318088889122, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7359 + }, + { + "epoch": 5.878594249201278, + "grad_norm": 0.10985474288463593, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7360 + }, + { + "epoch": 5.8793929712460065, + "grad_norm": 0.09968867897987366, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7361 + }, + { + "epoch": 5.880191693290735, + "grad_norm": 0.07277355343103409, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7362 + }, + { + "epoch": 5.8809904153354635, + "grad_norm": 0.043085962533950806, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7363 + }, + { + "epoch": 5.881789137380192, + "grad_norm": 0.10392415523529053, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7364 + }, + { + "epoch": 5.88258785942492, + "grad_norm": 0.05523041635751724, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7365 + }, + { + "epoch": 5.883386581469648, + "grad_norm": 0.1754276603460312, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7366 + }, + { + "epoch": 5.884185303514377, + "grad_norm": 0.09561391174793243, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7367 + }, + { + "epoch": 5.884984025559105, + "grad_norm": 0.17572976648807526, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7368 + }, + { + "epoch": 5.885782747603834, + "grad_norm": 0.06476190686225891, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7369 + }, + { + "epoch": 5.886581469648562, + "grad_norm": 0.08763223886489868, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7370 + }, + { + "epoch": 5.887380191693291, + "grad_norm": 0.04419226944446564, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7371 + }, + { + "epoch": 5.888178913738019, + "grad_norm": 0.08707522600889206, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7372 + }, + { + "epoch": 5.888977635782748, + "grad_norm": 0.3117498457431793, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7373 + }, + { + "epoch": 5.8897763578274756, + "grad_norm": 0.04153338074684143, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7374 + }, + { + "epoch": 5.890575079872205, + "grad_norm": 0.10575849562883377, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7375 + }, + { + "epoch": 5.891373801916933, + "grad_norm": 0.07147886604070663, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7376 + }, + { + "epoch": 5.892172523961661, + "grad_norm": 0.05394810438156128, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7377 + }, + { + "epoch": 5.89297124600639, + "grad_norm": 0.15453197062015533, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7378 + }, + { + "epoch": 5.893769968051118, + "grad_norm": 0.19460639357566833, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7379 + }, + { + "epoch": 5.894568690095847, + "grad_norm": 0.13046157360076904, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7380 + }, + { + "epoch": 5.895367412140575, + "grad_norm": 0.09074800461530685, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7381 + }, + { + "epoch": 5.896166134185304, + "grad_norm": 0.09315948188304901, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7382 + }, + { + "epoch": 5.896964856230032, + "grad_norm": 0.0572352297604084, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7383 + }, + { + "epoch": 5.897763578274761, + "grad_norm": 0.09366700798273087, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7384 + }, + { + "epoch": 5.8985623003194885, + "grad_norm": 0.12643125653266907, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7385 + }, + { + "epoch": 5.899361022364217, + "grad_norm": 0.14831441640853882, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7386 + }, + { + "epoch": 5.9001597444089455, + "grad_norm": 0.06892798840999603, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7387 + }, + { + "epoch": 5.900958466453674, + "grad_norm": 0.24058189988136292, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7388 + }, + { + "epoch": 5.901757188498403, + "grad_norm": 0.12589944899082184, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7389 + }, + { + "epoch": 5.902555910543131, + "grad_norm": 0.10197508335113525, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7390 + }, + { + "epoch": 5.90335463258786, + "grad_norm": 0.04367182031273842, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7391 + }, + { + "epoch": 5.904153354632588, + "grad_norm": 0.11131702363491058, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7392 + }, + { + "epoch": 5.904952076677317, + "grad_norm": 0.10258752107620239, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7393 + }, + { + "epoch": 5.905750798722044, + "grad_norm": 0.05077935755252838, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7394 + }, + { + "epoch": 5.906549520766773, + "grad_norm": 0.13514964282512665, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7395 + }, + { + "epoch": 5.907348242811501, + "grad_norm": 0.365681916475296, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7396 + }, + { + "epoch": 5.90814696485623, + "grad_norm": 0.09199032932519913, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7397 + }, + { + "epoch": 5.9089456869009584, + "grad_norm": 0.10341943800449371, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7398 + }, + { + "epoch": 5.909744408945687, + "grad_norm": 0.05396822467446327, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7399 + }, + { + "epoch": 5.9105431309904155, + "grad_norm": 0.06582850217819214, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 7400 + }, + { + "epoch": 5.911341853035144, + "grad_norm": 0.04932714253664017, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7401 + }, + { + "epoch": 5.912140575079873, + "grad_norm": 0.08820181339979172, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7402 + }, + { + "epoch": 5.9129392971246, + "grad_norm": 0.08759067952632904, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7403 + }, + { + "epoch": 5.913738019169329, + "grad_norm": 0.0582246370613575, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7404 + }, + { + "epoch": 5.914536741214057, + "grad_norm": 0.3632248044013977, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7405 + }, + { + "epoch": 5.915335463258786, + "grad_norm": 0.054485730826854706, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7406 + }, + { + "epoch": 5.916134185303514, + "grad_norm": 0.06776587665081024, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7407 + }, + { + "epoch": 5.916932907348243, + "grad_norm": 0.06876091659069061, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7408 + }, + { + "epoch": 5.917731629392971, + "grad_norm": 0.06507224589586258, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7409 + }, + { + "epoch": 5.9185303514377, + "grad_norm": 1.061123013496399, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7410 + }, + { + "epoch": 5.919329073482428, + "grad_norm": 0.2808170020580292, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7411 + }, + { + "epoch": 5.920127795527156, + "grad_norm": 0.2075907289981842, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7412 + }, + { + "epoch": 5.9209265175718855, + "grad_norm": 0.08707362413406372, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7413 + }, + { + "epoch": 5.921725239616613, + "grad_norm": 0.17357248067855835, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 7414 + }, + { + "epoch": 5.922523961661342, + "grad_norm": 0.19713328778743744, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7415 + }, + { + "epoch": 5.92332268370607, + "grad_norm": 0.10456258803606033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7416 + }, + { + "epoch": 5.924121405750799, + "grad_norm": 0.10678638517856598, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7417 + }, + { + "epoch": 5.924920127795527, + "grad_norm": 0.12577000260353088, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7418 + }, + { + "epoch": 5.925718849840256, + "grad_norm": 0.14730660617351532, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7419 + }, + { + "epoch": 5.926517571884984, + "grad_norm": 0.07055118680000305, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7420 + }, + { + "epoch": 5.927316293929713, + "grad_norm": 0.10249259322881699, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7421 + }, + { + "epoch": 5.928115015974441, + "grad_norm": 0.06859050691127777, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7422 + }, + { + "epoch": 5.928913738019169, + "grad_norm": 0.043517664074897766, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7423 + }, + { + "epoch": 5.9297124600638975, + "grad_norm": 0.06680947542190552, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7424 + }, + { + "epoch": 5.930511182108626, + "grad_norm": 0.07522429525852203, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7425 + }, + { + "epoch": 5.931309904153355, + "grad_norm": 0.15828543901443481, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7426 + }, + { + "epoch": 5.932108626198083, + "grad_norm": 0.19134600460529327, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7427 + }, + { + "epoch": 5.932907348242812, + "grad_norm": 0.12455222010612488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7428 + }, + { + "epoch": 5.93370607028754, + "grad_norm": 0.11147905886173248, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7429 + }, + { + "epoch": 5.934504792332269, + "grad_norm": 0.1238674744963646, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7430 + }, + { + "epoch": 5.935303514376997, + "grad_norm": 0.15700307488441467, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7431 + }, + { + "epoch": 5.936102236421725, + "grad_norm": 0.11487080156803131, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7432 + }, + { + "epoch": 5.936900958466453, + "grad_norm": 0.11961077898740768, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7433 + }, + { + "epoch": 5.937699680511182, + "grad_norm": 0.07594173401594162, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7434 + }, + { + "epoch": 5.93849840255591, + "grad_norm": 0.19439400732517242, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7435 + }, + { + "epoch": 5.939297124600639, + "grad_norm": 0.17745599150657654, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7436 + }, + { + "epoch": 5.9400958466453675, + "grad_norm": 0.15732692182064056, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7437 + }, + { + "epoch": 5.940894568690096, + "grad_norm": 0.08824916929006577, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7438 + }, + { + "epoch": 5.9416932907348246, + "grad_norm": 0.12354888767004013, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7439 + }, + { + "epoch": 5.942492012779553, + "grad_norm": 0.10940376669168472, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7440 + }, + { + "epoch": 5.943290734824281, + "grad_norm": 0.05808279290795326, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7441 + }, + { + "epoch": 5.944089456869009, + "grad_norm": 0.19519653916358948, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 7442 + }, + { + "epoch": 5.944888178913738, + "grad_norm": 0.07913058996200562, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7443 + }, + { + "epoch": 5.945686900958466, + "grad_norm": 0.5150377750396729, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7444 + }, + { + "epoch": 5.946485623003195, + "grad_norm": 0.24083790183067322, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7445 + }, + { + "epoch": 5.947284345047923, + "grad_norm": 0.11291394382715225, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7446 + }, + { + "epoch": 5.948083067092652, + "grad_norm": 0.0899023786187172, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7447 + }, + { + "epoch": 5.94888178913738, + "grad_norm": 0.05489958077669144, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7448 + }, + { + "epoch": 5.949680511182109, + "grad_norm": 0.12375161051750183, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7449 + }, + { + "epoch": 5.950479233226837, + "grad_norm": 0.11610512435436249, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7450 + }, + { + "epoch": 5.951277955271565, + "grad_norm": 0.06953240931034088, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7451 + }, + { + "epoch": 5.952076677316294, + "grad_norm": 0.09784717857837677, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7452 + }, + { + "epoch": 5.952875399361022, + "grad_norm": 0.059533409774303436, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7453 + }, + { + "epoch": 5.953674121405751, + "grad_norm": 0.06361017376184464, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7454 + }, + { + "epoch": 5.954472843450479, + "grad_norm": 0.33739587664604187, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7455 + }, + { + "epoch": 5.955271565495208, + "grad_norm": 0.0726039931178093, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7456 + }, + { + "epoch": 5.956070287539936, + "grad_norm": 0.047813788056373596, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7457 + }, + { + "epoch": 5.956869009584665, + "grad_norm": 0.05501490831375122, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7458 + }, + { + "epoch": 5.957667731629393, + "grad_norm": 0.24806374311447144, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7459 + }, + { + "epoch": 5.958466453674122, + "grad_norm": 0.09020408987998962, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7460 + }, + { + "epoch": 5.9592651757188495, + "grad_norm": 0.09845588356256485, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7461 + }, + { + "epoch": 5.960063897763578, + "grad_norm": 0.2733388841152191, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7462 + }, + { + "epoch": 5.960862619808307, + "grad_norm": 0.04368302598595619, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7463 + }, + { + "epoch": 5.961661341853035, + "grad_norm": 0.06559797376394272, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7464 + }, + { + "epoch": 5.962460063897764, + "grad_norm": 0.08194267004728317, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7465 + }, + { + "epoch": 5.963258785942492, + "grad_norm": 0.08440488576889038, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7466 + }, + { + "epoch": 5.964057507987221, + "grad_norm": 0.07046753168106079, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7467 + }, + { + "epoch": 5.964856230031949, + "grad_norm": 0.061910174787044525, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7468 + }, + { + "epoch": 5.965654952076678, + "grad_norm": 0.06781110167503357, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7469 + }, + { + "epoch": 5.966453674121405, + "grad_norm": 0.0626576617360115, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7470 + }, + { + "epoch": 5.967252396166134, + "grad_norm": 0.05339542031288147, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7471 + }, + { + "epoch": 5.968051118210862, + "grad_norm": 0.09167633950710297, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7472 + }, + { + "epoch": 5.968849840255591, + "grad_norm": 0.07272132486104965, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7473 + }, + { + "epoch": 5.9696485623003195, + "grad_norm": 0.1218709796667099, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7474 + }, + { + "epoch": 5.970447284345048, + "grad_norm": 0.21024082601070404, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7475 + }, + { + "epoch": 5.9712460063897765, + "grad_norm": 0.08869504183530807, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7476 + }, + { + "epoch": 5.972044728434505, + "grad_norm": 0.05930836871266365, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7477 + }, + { + "epoch": 5.972843450479234, + "grad_norm": 0.10009569674730301, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7478 + }, + { + "epoch": 5.973642172523961, + "grad_norm": 0.2543089687824249, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7479 + }, + { + "epoch": 5.97444089456869, + "grad_norm": 0.04702993854880333, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7480 + }, + { + "epoch": 5.975239616613418, + "grad_norm": 0.12841154634952545, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7481 + }, + { + "epoch": 5.976038338658147, + "grad_norm": 0.10137920081615448, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7482 + }, + { + "epoch": 5.976837060702875, + "grad_norm": 0.0582512766122818, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7483 + }, + { + "epoch": 5.977635782747604, + "grad_norm": 0.06556501984596252, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7484 + }, + { + "epoch": 5.978434504792332, + "grad_norm": 0.2065235674381256, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7485 + }, + { + "epoch": 5.979233226837061, + "grad_norm": 0.07943716645240784, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7486 + }, + { + "epoch": 5.9800319488817895, + "grad_norm": 0.05257594957947731, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7487 + }, + { + "epoch": 5.980830670926517, + "grad_norm": 0.06949680298566818, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7488 + }, + { + "epoch": 5.981629392971246, + "grad_norm": 0.0967894196510315, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7489 + }, + { + "epoch": 5.982428115015974, + "grad_norm": 1.068231463432312, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7490 + }, + { + "epoch": 5.983226837060703, + "grad_norm": 0.0648348405957222, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7491 + }, + { + "epoch": 5.984025559105431, + "grad_norm": 0.2540450096130371, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7492 + }, + { + "epoch": 5.98482428115016, + "grad_norm": 0.1624346375465393, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7493 + }, + { + "epoch": 5.985623003194888, + "grad_norm": 0.10054703056812286, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7494 + }, + { + "epoch": 5.986421725239617, + "grad_norm": 0.05147058889269829, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7495 + }, + { + "epoch": 5.987220447284345, + "grad_norm": 0.10036633163690567, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7496 + }, + { + "epoch": 5.988019169329074, + "grad_norm": 0.14611777663230896, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7497 + }, + { + "epoch": 5.988817891373802, + "grad_norm": 0.12323570251464844, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7498 + }, + { + "epoch": 5.98961661341853, + "grad_norm": 0.04539888724684715, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7499 + }, + { + "epoch": 5.9904153354632586, + "grad_norm": 0.14555387198925018, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7500 + }, + { + "epoch": 5.991214057507987, + "grad_norm": 0.3205990195274353, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7501 + }, + { + "epoch": 5.992012779552716, + "grad_norm": 0.22900770604610443, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7502 + }, + { + "epoch": 5.992811501597444, + "grad_norm": 0.11138728260993958, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7503 + }, + { + "epoch": 5.993610223642173, + "grad_norm": 0.09425637125968933, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7504 + }, + { + "epoch": 5.994408945686901, + "grad_norm": 0.18409870564937592, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7505 + }, + { + "epoch": 5.99520766773163, + "grad_norm": 0.1610010713338852, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7506 + }, + { + "epoch": 5.996006389776358, + "grad_norm": 0.2304852306842804, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7507 + }, + { + "epoch": 5.996805111821086, + "grad_norm": 0.09830645471811295, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7508 + }, + { + "epoch": 5.997603833865814, + "grad_norm": 0.12319398671388626, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7509 + }, + { + "epoch": 5.998402555910543, + "grad_norm": 0.07925699651241302, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7510 + }, + { + "epoch": 5.9992012779552715, + "grad_norm": 0.07079242914915085, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7511 + }, + { + "epoch": 6.0, + "grad_norm": 0.14047275483608246, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7512 + }, + { + "epoch": 6.0007987220447285, + "grad_norm": 0.172583669424057, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7513 + }, + { + "epoch": 6.001597444089457, + "grad_norm": 0.3635086119174957, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7514 + }, + { + "epoch": 6.002396166134186, + "grad_norm": 0.14463695883750916, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7515 + }, + { + "epoch": 6.003194888178914, + "grad_norm": 0.24417585134506226, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7516 + }, + { + "epoch": 6.003993610223642, + "grad_norm": 0.25690382719039917, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7517 + }, + { + "epoch": 6.00479233226837, + "grad_norm": 0.12535394728183746, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 7518 + }, + { + "epoch": 6.005591054313099, + "grad_norm": 0.19279715418815613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7519 + }, + { + "epoch": 6.006389776357827, + "grad_norm": 0.10537917166948318, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7520 + }, + { + "epoch": 6.007188498402556, + "grad_norm": 0.07752633094787598, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7521 + }, + { + "epoch": 6.007987220447284, + "grad_norm": 0.10693971067667007, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7522 + }, + { + "epoch": 6.008785942492013, + "grad_norm": 0.06399057805538177, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7523 + }, + { + "epoch": 6.0095846645367414, + "grad_norm": 0.12577609717845917, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7524 + }, + { + "epoch": 6.01038338658147, + "grad_norm": 0.12770701944828033, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7525 + }, + { + "epoch": 6.0111821086261985, + "grad_norm": 0.07679085433483124, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7526 + }, + { + "epoch": 6.011980830670926, + "grad_norm": 0.14353524148464203, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7527 + }, + { + "epoch": 6.012779552715655, + "grad_norm": 0.3428184688091278, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7528 + }, + { + "epoch": 6.013578274760383, + "grad_norm": 0.1436242014169693, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 7529 + }, + { + "epoch": 6.014376996805112, + "grad_norm": 0.07608507573604584, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7530 + }, + { + "epoch": 6.01517571884984, + "grad_norm": 0.10932086408138275, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7531 + }, + { + "epoch": 6.015974440894569, + "grad_norm": 0.07631878554821014, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7532 + }, + { + "epoch": 6.016773162939297, + "grad_norm": 0.0718175396323204, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7533 + }, + { + "epoch": 6.017571884984026, + "grad_norm": 0.07661164551973343, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7534 + }, + { + "epoch": 6.018370607028754, + "grad_norm": 0.10753245651721954, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7535 + }, + { + "epoch": 6.019169329073482, + "grad_norm": 0.12740729749202728, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7536 + }, + { + "epoch": 6.0199680511182105, + "grad_norm": 0.14345388114452362, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7537 + }, + { + "epoch": 6.020766773162939, + "grad_norm": 0.13860031962394714, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7538 + }, + { + "epoch": 6.021565495207668, + "grad_norm": 0.07766555994749069, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7539 + }, + { + "epoch": 6.022364217252396, + "grad_norm": 0.11253347247838974, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7540 + }, + { + "epoch": 6.023162939297125, + "grad_norm": 0.18870452046394348, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7541 + }, + { + "epoch": 6.023961661341853, + "grad_norm": 0.12401654571294785, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7542 + }, + { + "epoch": 6.024760383386582, + "grad_norm": 0.08025321364402771, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7543 + }, + { + "epoch": 6.02555910543131, + "grad_norm": 0.12504157423973083, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7544 + }, + { + "epoch": 6.026357827476039, + "grad_norm": 0.07099851220846176, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7545 + }, + { + "epoch": 6.027156549520766, + "grad_norm": 0.09573683142662048, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7546 + }, + { + "epoch": 6.027955271565495, + "grad_norm": 0.18280553817749023, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7547 + }, + { + "epoch": 6.0287539936102235, + "grad_norm": 0.15688058733940125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7548 + }, + { + "epoch": 6.029552715654952, + "grad_norm": 0.11738436669111252, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7549 + }, + { + "epoch": 6.0303514376996805, + "grad_norm": 1.275103211402893, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7550 + }, + { + "epoch": 6.031150159744409, + "grad_norm": 0.39542102813720703, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7551 + }, + { + "epoch": 6.031948881789138, + "grad_norm": 0.32140371203422546, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7552 + }, + { + "epoch": 6.032747603833866, + "grad_norm": 0.2855371832847595, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7553 + }, + { + "epoch": 6.033546325878595, + "grad_norm": 0.14987513422966003, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7554 + }, + { + "epoch": 6.034345047923322, + "grad_norm": 0.25978198647499084, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7555 + }, + { + "epoch": 6.035143769968051, + "grad_norm": 0.14043942093849182, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7556 + }, + { + "epoch": 6.035942492012779, + "grad_norm": 0.16670344769954681, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7557 + }, + { + "epoch": 6.036741214057508, + "grad_norm": 0.1668681800365448, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7558 + }, + { + "epoch": 6.037539936102236, + "grad_norm": 0.11135906726121902, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7559 + }, + { + "epoch": 6.038338658146965, + "grad_norm": 0.26222026348114014, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7560 + }, + { + "epoch": 6.039137380191693, + "grad_norm": 0.1670113205909729, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7561 + }, + { + "epoch": 6.039936102236422, + "grad_norm": 0.15860766172409058, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7562 + }, + { + "epoch": 6.0407348242811505, + "grad_norm": 0.2577793300151825, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7563 + }, + { + "epoch": 6.041533546325879, + "grad_norm": 0.11147591471672058, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7564 + }, + { + "epoch": 6.042332268370607, + "grad_norm": 0.18452385067939758, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7565 + }, + { + "epoch": 6.043130990415335, + "grad_norm": 0.19697625935077667, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7566 + }, + { + "epoch": 6.043929712460064, + "grad_norm": 0.08586452901363373, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7567 + }, + { + "epoch": 6.044728434504792, + "grad_norm": 0.18721693754196167, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7568 + }, + { + "epoch": 6.045527156549521, + "grad_norm": 0.13190758228302002, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7569 + }, + { + "epoch": 6.046325878594249, + "grad_norm": 0.09424075484275818, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7570 + }, + { + "epoch": 6.047124600638978, + "grad_norm": 0.15252210199832916, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7571 + }, + { + "epoch": 6.047923322683706, + "grad_norm": 0.06378420442342758, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7572 + }, + { + "epoch": 6.048722044728435, + "grad_norm": 0.07665325701236725, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7573 + }, + { + "epoch": 6.0495207667731625, + "grad_norm": 0.0847245529294014, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7574 + }, + { + "epoch": 6.050319488817891, + "grad_norm": 0.034070566296577454, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7575 + }, + { + "epoch": 6.05111821086262, + "grad_norm": 0.08149915188550949, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7576 + }, + { + "epoch": 6.051916932907348, + "grad_norm": 0.07882412523031235, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7577 + }, + { + "epoch": 6.052715654952077, + "grad_norm": 0.055492956191301346, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7578 + }, + { + "epoch": 6.053514376996805, + "grad_norm": 0.10246025770902634, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7579 + }, + { + "epoch": 6.054313099041534, + "grad_norm": 0.11067861318588257, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7580 + }, + { + "epoch": 6.055111821086262, + "grad_norm": 0.06063758581876755, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7581 + }, + { + "epoch": 6.055910543130991, + "grad_norm": 0.06848330795764923, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 7582 + }, + { + "epoch": 6.056709265175719, + "grad_norm": 0.10336993634700775, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7583 + }, + { + "epoch": 6.057507987220447, + "grad_norm": 0.06081530824303627, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7584 + }, + { + "epoch": 6.0583067092651754, + "grad_norm": 0.08049804717302322, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7585 + }, + { + "epoch": 6.059105431309904, + "grad_norm": 0.09174875915050507, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7586 + }, + { + "epoch": 6.0599041533546325, + "grad_norm": 0.06121581420302391, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7587 + }, + { + "epoch": 6.060702875399361, + "grad_norm": 0.10653077065944672, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7588 + }, + { + "epoch": 6.06150159744409, + "grad_norm": 0.0676097571849823, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7589 + }, + { + "epoch": 6.062300319488818, + "grad_norm": 0.0625678300857544, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7590 + }, + { + "epoch": 6.063099041533547, + "grad_norm": 0.07936695963144302, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7591 + }, + { + "epoch": 6.063897763578275, + "grad_norm": 0.06149541214108467, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7592 + }, + { + "epoch": 6.064696485623003, + "grad_norm": 0.04549092426896095, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7593 + }, + { + "epoch": 6.065495207667731, + "grad_norm": 0.06483953446149826, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7594 + }, + { + "epoch": 6.06629392971246, + "grad_norm": 0.04048188030719757, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7595 + }, + { + "epoch": 6.067092651757188, + "grad_norm": 0.038281429558992386, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7596 + }, + { + "epoch": 6.067891373801917, + "grad_norm": 0.06686673313379288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7597 + }, + { + "epoch": 6.068690095846645, + "grad_norm": 0.09025852382183075, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7598 + }, + { + "epoch": 6.069488817891374, + "grad_norm": 0.07517793774604797, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7599 + }, + { + "epoch": 6.0702875399361025, + "grad_norm": 0.06342573463916779, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7600 + }, + { + "epoch": 6.071086261980831, + "grad_norm": 0.08630760759115219, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7601 + }, + { + "epoch": 6.0718849840255595, + "grad_norm": 0.06443625688552856, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7602 + }, + { + "epoch": 6.072683706070287, + "grad_norm": 0.08748311549425125, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7603 + }, + { + "epoch": 6.073482428115016, + "grad_norm": 0.051623452454805374, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7604 + }, + { + "epoch": 6.074281150159744, + "grad_norm": 0.09098891913890839, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7605 + }, + { + "epoch": 6.075079872204473, + "grad_norm": 0.14741428196430206, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7606 + }, + { + "epoch": 6.075878594249201, + "grad_norm": 0.064545176923275, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7607 + }, + { + "epoch": 6.07667731629393, + "grad_norm": 0.09775100648403168, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7608 + }, + { + "epoch": 6.077476038338658, + "grad_norm": 0.14192643761634827, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7609 + }, + { + "epoch": 6.078274760383387, + "grad_norm": 0.05390379950404167, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7610 + }, + { + "epoch": 6.079073482428115, + "grad_norm": 0.35628536343574524, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7611 + }, + { + "epoch": 6.079872204472843, + "grad_norm": 0.11727920919656754, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7612 + }, + { + "epoch": 6.080670926517572, + "grad_norm": 0.053165338933467865, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7613 + }, + { + "epoch": 6.0814696485623, + "grad_norm": 0.12718519568443298, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7614 + }, + { + "epoch": 6.082268370607029, + "grad_norm": 0.12406741827726364, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7615 + }, + { + "epoch": 6.083067092651757, + "grad_norm": 0.05323740839958191, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7616 + }, + { + "epoch": 6.083865814696486, + "grad_norm": 0.09811960160732269, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7617 + }, + { + "epoch": 6.084664536741214, + "grad_norm": 0.12453506886959076, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7618 + }, + { + "epoch": 6.085463258785943, + "grad_norm": 0.13459496200084686, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7619 + }, + { + "epoch": 6.086261980830671, + "grad_norm": 0.20130378007888794, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7620 + }, + { + "epoch": 6.0870607028754, + "grad_norm": 0.11361974477767944, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7621 + }, + { + "epoch": 6.087859424920127, + "grad_norm": 0.07432135194540024, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7622 + }, + { + "epoch": 6.088658146964856, + "grad_norm": 0.14522314071655273, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7623 + }, + { + "epoch": 6.0894568690095845, + "grad_norm": 0.050937261432409286, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7624 + }, + { + "epoch": 6.090255591054313, + "grad_norm": 0.12386021763086319, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7625 + }, + { + "epoch": 6.0910543130990416, + "grad_norm": 0.1498231738805771, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7626 + }, + { + "epoch": 6.09185303514377, + "grad_norm": 0.042041294276714325, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7627 + }, + { + "epoch": 6.092651757188499, + "grad_norm": 0.1103961393237114, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7628 + }, + { + "epoch": 6.093450479233227, + "grad_norm": 0.12362606078386307, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7629 + }, + { + "epoch": 6.094249201277956, + "grad_norm": 0.07069346308708191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7630 + }, + { + "epoch": 6.095047923322683, + "grad_norm": 0.1306593418121338, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7631 + }, + { + "epoch": 6.095846645367412, + "grad_norm": 0.11293961852788925, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7632 + }, + { + "epoch": 6.09664536741214, + "grad_norm": 0.07145176827907562, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7633 + }, + { + "epoch": 6.097444089456869, + "grad_norm": 0.11122562736272812, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7634 + }, + { + "epoch": 6.098242811501597, + "grad_norm": 0.039713576436042786, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7635 + }, + { + "epoch": 6.099041533546326, + "grad_norm": 0.11573004722595215, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7636 + }, + { + "epoch": 6.0998402555910545, + "grad_norm": 0.11995833367109299, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7637 + }, + { + "epoch": 6.100638977635783, + "grad_norm": 0.03895663470029831, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7638 + }, + { + "epoch": 6.1014376996805115, + "grad_norm": 0.11274216324090958, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7639 + }, + { + "epoch": 6.102236421725239, + "grad_norm": 0.14242613315582275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7640 + }, + { + "epoch": 6.103035143769968, + "grad_norm": 0.04954848438501358, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7641 + }, + { + "epoch": 6.103833865814696, + "grad_norm": 0.10814809799194336, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7642 + }, + { + "epoch": 6.104632587859425, + "grad_norm": 0.11696363240480423, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7643 + }, + { + "epoch": 6.105431309904153, + "grad_norm": 0.04597959294915199, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7644 + }, + { + "epoch": 6.106230031948882, + "grad_norm": 0.16304457187652588, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7645 + }, + { + "epoch": 6.10702875399361, + "grad_norm": 0.14835208654403687, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7646 + }, + { + "epoch": 6.107827476038339, + "grad_norm": 0.06062949076294899, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7647 + }, + { + "epoch": 6.108626198083067, + "grad_norm": 0.1033453568816185, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7648 + }, + { + "epoch": 6.109424920127796, + "grad_norm": 0.14823280274868011, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7649 + }, + { + "epoch": 6.110223642172524, + "grad_norm": 0.18282924592494965, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7650 + }, + { + "epoch": 6.111022364217252, + "grad_norm": 0.17962203919887543, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7651 + }, + { + "epoch": 6.111821086261981, + "grad_norm": 0.12176015228033066, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7652 + }, + { + "epoch": 6.112619808306709, + "grad_norm": 0.07326921075582504, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7653 + }, + { + "epoch": 6.113418530351438, + "grad_norm": 0.24457645416259766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7654 + }, + { + "epoch": 6.114217252396166, + "grad_norm": 0.1442916989326477, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7655 + }, + { + "epoch": 6.115015974440895, + "grad_norm": 0.0716436356306076, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7656 + }, + { + "epoch": 6.115814696485623, + "grad_norm": 0.20782648026943207, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7657 + }, + { + "epoch": 6.116613418530352, + "grad_norm": 0.1183728352189064, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7658 + }, + { + "epoch": 6.11741214057508, + "grad_norm": 0.13251493871212006, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7659 + }, + { + "epoch": 6.118210862619808, + "grad_norm": 0.21223802864551544, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7660 + }, + { + "epoch": 6.1190095846645365, + "grad_norm": 0.0811460018157959, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7661 + }, + { + "epoch": 6.119808306709265, + "grad_norm": 0.13528718054294586, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7662 + }, + { + "epoch": 6.1206070287539935, + "grad_norm": 0.11806038022041321, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7663 + }, + { + "epoch": 6.121405750798722, + "grad_norm": 0.10022544860839844, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7664 + }, + { + "epoch": 6.122204472843451, + "grad_norm": 0.21452540159225464, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7665 + }, + { + "epoch": 6.123003194888179, + "grad_norm": 0.11949847638607025, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7666 + }, + { + "epoch": 6.123801916932908, + "grad_norm": 0.12636634707450867, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7667 + }, + { + "epoch": 6.124600638977636, + "grad_norm": 0.17132572829723358, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7668 + }, + { + "epoch": 6.125399361022364, + "grad_norm": 0.1116800457239151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7669 + }, + { + "epoch": 6.126198083067092, + "grad_norm": 0.13965120911598206, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7670 + }, + { + "epoch": 6.126996805111821, + "grad_norm": 0.1346610188484192, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7671 + }, + { + "epoch": 6.127795527156549, + "grad_norm": 0.07977228611707687, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7672 + }, + { + "epoch": 6.128594249201278, + "grad_norm": 0.21412506699562073, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7673 + }, + { + "epoch": 6.1293929712460065, + "grad_norm": 0.172305628657341, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7674 + }, + { + "epoch": 6.130191693290735, + "grad_norm": 0.10782980173826218, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7675 + }, + { + "epoch": 6.1309904153354635, + "grad_norm": 0.23166432976722717, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7676 + }, + { + "epoch": 6.131789137380192, + "grad_norm": 0.12337028980255127, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7677 + }, + { + "epoch": 6.13258785942492, + "grad_norm": 0.11406251043081284, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7678 + }, + { + "epoch": 6.133386581469648, + "grad_norm": 0.19163282215595245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7679 + }, + { + "epoch": 6.134185303514377, + "grad_norm": 0.06671248376369476, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7680 + }, + { + "epoch": 6.134984025559105, + "grad_norm": 0.13190557062625885, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7681 + }, + { + "epoch": 6.135782747603834, + "grad_norm": 0.20761321485042572, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7682 + }, + { + "epoch": 6.136581469648562, + "grad_norm": 0.08118047565221786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7683 + }, + { + "epoch": 6.137380191693291, + "grad_norm": 0.1458984613418579, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7684 + }, + { + "epoch": 6.138178913738019, + "grad_norm": 0.1305929571390152, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7685 + }, + { + "epoch": 6.138977635782748, + "grad_norm": 0.0972108244895935, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7686 + }, + { + "epoch": 6.139776357827476, + "grad_norm": 0.14246216416358948, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7687 + }, + { + "epoch": 6.140575079872204, + "grad_norm": 0.04341820999979973, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7688 + }, + { + "epoch": 6.141373801916933, + "grad_norm": 0.127020001411438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7689 + }, + { + "epoch": 6.142172523961661, + "grad_norm": 0.08494339138269424, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7690 + }, + { + "epoch": 6.14297124600639, + "grad_norm": 0.11377454549074173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7691 + }, + { + "epoch": 6.143769968051118, + "grad_norm": 0.13752779364585876, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7692 + }, + { + "epoch": 6.144568690095847, + "grad_norm": 0.054878801107406616, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7693 + }, + { + "epoch": 6.145367412140575, + "grad_norm": 0.11313790827989578, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7694 + }, + { + "epoch": 6.146166134185304, + "grad_norm": 0.04388728365302086, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7695 + }, + { + "epoch": 6.146964856230032, + "grad_norm": 0.12842994928359985, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7696 + }, + { + "epoch": 6.147763578274761, + "grad_norm": 0.1374971568584442, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7697 + }, + { + "epoch": 6.1485623003194885, + "grad_norm": 0.1082429438829422, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7698 + }, + { + "epoch": 6.149361022364217, + "grad_norm": 0.14329178631305695, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7699 + }, + { + "epoch": 6.1501597444089455, + "grad_norm": 0.07794678211212158, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7700 + }, + { + "epoch": 6.150958466453674, + "grad_norm": 0.10680928826332092, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7701 + }, + { + "epoch": 6.151757188498403, + "grad_norm": 0.11628691852092743, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7702 + }, + { + "epoch": 6.152555910543131, + "grad_norm": 0.03565143793821335, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7703 + }, + { + "epoch": 6.15335463258786, + "grad_norm": 0.10634133219718933, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7704 + }, + { + "epoch": 6.154153354632588, + "grad_norm": 0.10307054221630096, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7705 + }, + { + "epoch": 6.154952076677317, + "grad_norm": 0.05591967701911926, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7706 + }, + { + "epoch": 6.155750798722044, + "grad_norm": 0.07205721735954285, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7707 + }, + { + "epoch": 6.156549520766773, + "grad_norm": 0.05020968243479729, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7708 + }, + { + "epoch": 6.157348242811501, + "grad_norm": 0.037087470293045044, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7709 + }, + { + "epoch": 6.15814696485623, + "grad_norm": 0.06322529166936874, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7710 + }, + { + "epoch": 6.1589456869009584, + "grad_norm": 0.03881093114614487, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7711 + }, + { + "epoch": 6.159744408945687, + "grad_norm": 0.06219052895903587, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7712 + }, + { + "epoch": 6.1605431309904155, + "grad_norm": 0.043313659727573395, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7713 + }, + { + "epoch": 6.161341853035144, + "grad_norm": 0.05460439994931221, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7714 + }, + { + "epoch": 6.162140575079873, + "grad_norm": 0.045017000287771225, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7715 + }, + { + "epoch": 6.1629392971246, + "grad_norm": 0.08029863983392715, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7716 + }, + { + "epoch": 6.163738019169329, + "grad_norm": 0.06935936212539673, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7717 + }, + { + "epoch": 6.164536741214057, + "grad_norm": 0.12617695331573486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7718 + }, + { + "epoch": 6.165335463258786, + "grad_norm": 0.09746283292770386, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7719 + }, + { + "epoch": 6.166134185303514, + "grad_norm": 0.038731649518013, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7720 + }, + { + "epoch": 6.166932907348243, + "grad_norm": 0.1054256334900856, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7721 + }, + { + "epoch": 6.167731629392971, + "grad_norm": 0.0833977535367012, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7722 + }, + { + "epoch": 6.1685303514377, + "grad_norm": 1.3529000282287598, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7723 + }, + { + "epoch": 6.169329073482428, + "grad_norm": 0.06748781353235245, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7724 + }, + { + "epoch": 6.170127795527157, + "grad_norm": 0.06015792861580849, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7725 + }, + { + "epoch": 6.170926517571885, + "grad_norm": 0.07760192453861237, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7726 + }, + { + "epoch": 6.171725239616613, + "grad_norm": 0.09536328911781311, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7727 + }, + { + "epoch": 6.172523961661342, + "grad_norm": 0.051248203963041306, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7728 + }, + { + "epoch": 6.17332268370607, + "grad_norm": 0.09610000252723694, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7729 + }, + { + "epoch": 6.174121405750799, + "grad_norm": 0.0803515687584877, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7730 + }, + { + "epoch": 6.174920127795527, + "grad_norm": 0.0820179283618927, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7731 + }, + { + "epoch": 6.175718849840256, + "grad_norm": 0.08880780637264252, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7732 + }, + { + "epoch": 6.176517571884984, + "grad_norm": 0.12188591808080673, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7733 + }, + { + "epoch": 6.177316293929713, + "grad_norm": 0.06245967745780945, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7734 + }, + { + "epoch": 6.178115015974441, + "grad_norm": 0.06608586013317108, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7735 + }, + { + "epoch": 6.178913738019169, + "grad_norm": 0.08542132377624512, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7736 + }, + { + "epoch": 6.1797124600638975, + "grad_norm": 0.06510723382234573, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7737 + }, + { + "epoch": 6.180511182108626, + "grad_norm": 0.161012202501297, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7738 + }, + { + "epoch": 6.181309904153355, + "grad_norm": 0.07943159341812134, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7739 + }, + { + "epoch": 6.182108626198083, + "grad_norm": 0.07735269516706467, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7740 + }, + { + "epoch": 6.182907348242812, + "grad_norm": 0.07452470809221268, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7741 + }, + { + "epoch": 6.18370607028754, + "grad_norm": 0.06378357857465744, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7742 + }, + { + "epoch": 6.184504792332269, + "grad_norm": 0.06149968132376671, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7743 + }, + { + "epoch": 6.185303514376997, + "grad_norm": 0.06558738648891449, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7744 + }, + { + "epoch": 6.186102236421725, + "grad_norm": 0.06004631146788597, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7745 + }, + { + "epoch": 6.186900958466453, + "grad_norm": 0.09972328692674637, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7746 + }, + { + "epoch": 6.187699680511182, + "grad_norm": 0.059344276785850525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7747 + }, + { + "epoch": 6.18849840255591, + "grad_norm": 0.15083496272563934, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7748 + }, + { + "epoch": 6.189297124600639, + "grad_norm": 0.08041606843471527, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7749 + }, + { + "epoch": 6.1900958466453675, + "grad_norm": 0.0801318883895874, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7750 + }, + { + "epoch": 6.190894568690096, + "grad_norm": 0.13313926756381989, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7751 + }, + { + "epoch": 6.1916932907348246, + "grad_norm": 0.07887420803308487, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7752 + }, + { + "epoch": 6.192492012779553, + "grad_norm": 0.08653397113084793, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7753 + }, + { + "epoch": 6.193290734824281, + "grad_norm": 0.12184617668390274, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7754 + }, + { + "epoch": 6.194089456869009, + "grad_norm": 0.05356535315513611, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7755 + }, + { + "epoch": 6.194888178913738, + "grad_norm": 0.09529519081115723, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7756 + }, + { + "epoch": 6.195686900958466, + "grad_norm": 0.07658126950263977, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7757 + }, + { + "epoch": 6.196485623003195, + "grad_norm": 0.0785149484872818, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7758 + }, + { + "epoch": 6.197284345047923, + "grad_norm": 0.10748651623725891, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7759 + }, + { + "epoch": 6.198083067092652, + "grad_norm": 0.056907687336206436, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7760 + }, + { + "epoch": 6.19888178913738, + "grad_norm": 0.3713622987270355, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7761 + }, + { + "epoch": 6.199680511182109, + "grad_norm": 0.16671019792556763, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7762 + }, + { + "epoch": 6.2004792332268375, + "grad_norm": 0.10214395076036453, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7763 + }, + { + "epoch": 6.201277955271565, + "grad_norm": 0.09181013703346252, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7764 + }, + { + "epoch": 6.202076677316294, + "grad_norm": 0.18003405630588531, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7765 + }, + { + "epoch": 6.202875399361022, + "grad_norm": 0.1032429188489914, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7766 + }, + { + "epoch": 6.203674121405751, + "grad_norm": 0.06787005811929703, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7767 + }, + { + "epoch": 6.204472843450479, + "grad_norm": 0.09422674775123596, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7768 + }, + { + "epoch": 6.205271565495208, + "grad_norm": 0.04083932563662529, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7769 + }, + { + "epoch": 6.206070287539936, + "grad_norm": 0.1368017941713333, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7770 + }, + { + "epoch": 6.206869009584665, + "grad_norm": 0.23276877403259277, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7771 + }, + { + "epoch": 6.207667731629393, + "grad_norm": 0.13092860579490662, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7772 + }, + { + "epoch": 6.208466453674121, + "grad_norm": 0.14030441641807556, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7773 + }, + { + "epoch": 6.2092651757188495, + "grad_norm": 0.2016047090291977, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7774 + }, + { + "epoch": 6.210063897763578, + "grad_norm": 0.1224871277809143, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7775 + }, + { + "epoch": 6.210862619808307, + "grad_norm": 0.10741977393627167, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7776 + }, + { + "epoch": 6.211661341853035, + "grad_norm": 0.19775021076202393, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7777 + }, + { + "epoch": 6.212460063897764, + "grad_norm": 0.06731278449296951, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7778 + }, + { + "epoch": 6.213258785942492, + "grad_norm": 0.14070862531661987, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7779 + }, + { + "epoch": 6.214057507987221, + "grad_norm": 0.1267949938774109, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7780 + }, + { + "epoch": 6.214856230031949, + "grad_norm": 0.0694371834397316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7781 + }, + { + "epoch": 6.215654952076678, + "grad_norm": 0.12222267687320709, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7782 + }, + { + "epoch": 6.216453674121405, + "grad_norm": 0.1105445921421051, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7783 + }, + { + "epoch": 6.217252396166134, + "grad_norm": 0.05993608012795448, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7784 + }, + { + "epoch": 6.218051118210862, + "grad_norm": 0.11157821118831635, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7785 + }, + { + "epoch": 6.218849840255591, + "grad_norm": 0.05242336913943291, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7786 + }, + { + "epoch": 6.2196485623003195, + "grad_norm": 0.046115025877952576, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7787 + }, + { + "epoch": 6.220447284345048, + "grad_norm": 0.04029909893870354, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7788 + }, + { + "epoch": 6.2212460063897765, + "grad_norm": 0.057172924280166626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7789 + }, + { + "epoch": 6.222044728434505, + "grad_norm": 0.04958837479352951, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7790 + }, + { + "epoch": 6.222843450479234, + "grad_norm": 0.046313852071762085, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7791 + }, + { + "epoch": 6.223642172523961, + "grad_norm": 0.03824630752205849, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7792 + }, + { + "epoch": 6.22444089456869, + "grad_norm": 0.07159019261598587, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7793 + }, + { + "epoch": 6.225239616613418, + "grad_norm": 0.06316389888525009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7794 + }, + { + "epoch": 6.226038338658147, + "grad_norm": 0.088447704911232, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7795 + }, + { + "epoch": 6.226837060702875, + "grad_norm": 0.08749943226575851, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7796 + }, + { + "epoch": 6.227635782747604, + "grad_norm": 0.08757520467042923, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7797 + }, + { + "epoch": 6.228434504792332, + "grad_norm": 0.10777202993631363, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7798 + }, + { + "epoch": 6.229233226837061, + "grad_norm": 0.15780584514141083, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7799 + }, + { + "epoch": 6.2300319488817895, + "grad_norm": 0.10375814139842987, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7800 + }, + { + "epoch": 6.230830670926518, + "grad_norm": 0.3544321656227112, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7801 + }, + { + "epoch": 6.231629392971246, + "grad_norm": 0.11117644608020782, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7802 + }, + { + "epoch": 6.232428115015974, + "grad_norm": 0.13096286356449127, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7803 + }, + { + "epoch": 6.233226837060703, + "grad_norm": 0.2706630229949951, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7804 + }, + { + "epoch": 6.234025559105431, + "grad_norm": 0.05805981904268265, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7805 + }, + { + "epoch": 6.23482428115016, + "grad_norm": 0.14731241762638092, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7806 + }, + { + "epoch": 6.235623003194888, + "grad_norm": 0.08912478387355804, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7807 + }, + { + "epoch": 6.236421725239617, + "grad_norm": 0.15754206478595734, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7808 + }, + { + "epoch": 6.237220447284345, + "grad_norm": 0.21143318712711334, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7809 + }, + { + "epoch": 6.238019169329074, + "grad_norm": 0.11839418858289719, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7810 + }, + { + "epoch": 6.2388178913738015, + "grad_norm": 0.23939856886863708, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7811 + }, + { + "epoch": 6.23961661341853, + "grad_norm": 0.1438305526971817, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7812 + }, + { + "epoch": 6.2404153354632586, + "grad_norm": 0.11111237108707428, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7813 + }, + { + "epoch": 6.241214057507987, + "grad_norm": 0.19577394425868988, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7814 + }, + { + "epoch": 6.242012779552716, + "grad_norm": 0.1399260312318802, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7815 + }, + { + "epoch": 6.242811501597444, + "grad_norm": 0.16393627226352692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7816 + }, + { + "epoch": 6.243610223642173, + "grad_norm": 0.15071940422058105, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7817 + }, + { + "epoch": 6.244408945686901, + "grad_norm": 0.2121957242488861, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7818 + }, + { + "epoch": 6.24520766773163, + "grad_norm": 0.09854442626237869, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7819 + }, + { + "epoch": 6.246006389776358, + "grad_norm": 0.1327667534351349, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7820 + }, + { + "epoch": 6.246805111821086, + "grad_norm": 0.13909243047237396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7821 + }, + { + "epoch": 6.247603833865814, + "grad_norm": 0.08482292294502258, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7822 + }, + { + "epoch": 6.248402555910543, + "grad_norm": 0.0918656438589096, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7823 + }, + { + "epoch": 6.2492012779552715, + "grad_norm": 0.1352611631155014, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7824 + }, + { + "epoch": 6.25, + "grad_norm": 0.06178867816925049, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7825 + }, + { + "epoch": 6.2507987220447285, + "grad_norm": 0.1285342425107956, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7826 + }, + { + "epoch": 6.251597444089457, + "grad_norm": 0.17862951755523682, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7827 + }, + { + "epoch": 6.252396166134186, + "grad_norm": 0.574928343296051, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7828 + }, + { + "epoch": 6.253194888178914, + "grad_norm": 0.11522867530584335, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 7829 + }, + { + "epoch": 6.253993610223642, + "grad_norm": 0.08348001539707184, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7830 + }, + { + "epoch": 6.25479233226837, + "grad_norm": 0.1015007346868515, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7831 + }, + { + "epoch": 6.255591054313099, + "grad_norm": 0.18213561177253723, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7832 + }, + { + "epoch": 6.256389776357827, + "grad_norm": 0.1056833565235138, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7833 + }, + { + "epoch": 6.257188498402556, + "grad_norm": 0.09715890139341354, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7834 + }, + { + "epoch": 6.257987220447284, + "grad_norm": 0.17651355266571045, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7835 + }, + { + "epoch": 6.258785942492013, + "grad_norm": 0.11858265846967697, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7836 + }, + { + "epoch": 6.2595846645367414, + "grad_norm": 0.1400168240070343, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7837 + }, + { + "epoch": 6.26038338658147, + "grad_norm": 0.2133244276046753, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7838 + }, + { + "epoch": 6.261182108626198, + "grad_norm": 0.087309330701828, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7839 + }, + { + "epoch": 6.261980830670926, + "grad_norm": 0.07735110074281693, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7840 + }, + { + "epoch": 6.262779552715655, + "grad_norm": 0.08314932882785797, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7841 + }, + { + "epoch": 6.263578274760383, + "grad_norm": 0.13448217511177063, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7842 + }, + { + "epoch": 6.264376996805112, + "grad_norm": 1.4022712707519531, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7843 + }, + { + "epoch": 6.26517571884984, + "grad_norm": 0.1107354387640953, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7844 + }, + { + "epoch": 6.265974440894569, + "grad_norm": 0.17282478511333466, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7845 + }, + { + "epoch": 6.266773162939297, + "grad_norm": 0.0903516560792923, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7846 + }, + { + "epoch": 6.267571884984026, + "grad_norm": 0.07628770172595978, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7847 + }, + { + "epoch": 6.268370607028754, + "grad_norm": 0.08877440541982651, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7848 + }, + { + "epoch": 6.269169329073483, + "grad_norm": 0.041159700602293015, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7849 + }, + { + "epoch": 6.2699680511182105, + "grad_norm": 0.09187504649162292, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7850 + }, + { + "epoch": 6.270766773162939, + "grad_norm": 0.11252478510141373, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7851 + }, + { + "epoch": 6.271565495207668, + "grad_norm": 0.04354100301861763, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7852 + }, + { + "epoch": 6.272364217252396, + "grad_norm": 0.06845738738775253, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7853 + }, + { + "epoch": 6.273162939297125, + "grad_norm": 0.047235157340765, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7854 + }, + { + "epoch": 6.273961661341853, + "grad_norm": 0.04571741819381714, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7855 + }, + { + "epoch": 6.274760383386582, + "grad_norm": 0.09801016747951508, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7856 + }, + { + "epoch": 6.27555910543131, + "grad_norm": 0.12422922253608704, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7857 + }, + { + "epoch": 6.276357827476039, + "grad_norm": 0.07283129543066025, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7858 + }, + { + "epoch": 6.277156549520766, + "grad_norm": 0.07217510044574738, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7859 + }, + { + "epoch": 6.277955271565495, + "grad_norm": 0.1102033257484436, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7860 + }, + { + "epoch": 6.2787539936102235, + "grad_norm": 0.0814276710152626, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7861 + }, + { + "epoch": 6.279552715654952, + "grad_norm": 0.08247577399015427, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7862 + }, + { + "epoch": 6.2803514376996805, + "grad_norm": 0.04042622447013855, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7863 + }, + { + "epoch": 6.281150159744409, + "grad_norm": 0.049153268337249756, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7864 + }, + { + "epoch": 6.281948881789138, + "grad_norm": 0.07062675058841705, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7865 + }, + { + "epoch": 6.282747603833866, + "grad_norm": 0.06458686292171478, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7866 + }, + { + "epoch": 6.283546325878595, + "grad_norm": 0.093512162566185, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7867 + }, + { + "epoch": 6.284345047923322, + "grad_norm": 0.054384954273700714, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7868 + }, + { + "epoch": 6.285143769968051, + "grad_norm": 0.06253736466169357, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7869 + }, + { + "epoch": 6.285942492012779, + "grad_norm": 0.05566808953881264, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7870 + }, + { + "epoch": 6.286741214057508, + "grad_norm": 0.07693472504615784, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7871 + }, + { + "epoch": 6.287539936102236, + "grad_norm": 0.04471312463283539, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7872 + }, + { + "epoch": 6.288338658146965, + "grad_norm": 0.050770796835422516, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7873 + }, + { + "epoch": 6.289137380191693, + "grad_norm": 0.04736769199371338, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7874 + }, + { + "epoch": 6.289936102236422, + "grad_norm": 0.06550426036119461, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7875 + }, + { + "epoch": 6.2907348242811505, + "grad_norm": 0.0524384006857872, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7876 + }, + { + "epoch": 6.291533546325878, + "grad_norm": 0.10091802477836609, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7877 + }, + { + "epoch": 6.292332268370607, + "grad_norm": 0.14296530187129974, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7878 + }, + { + "epoch": 6.293130990415335, + "grad_norm": 0.08703069388866425, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7879 + }, + { + "epoch": 6.293929712460064, + "grad_norm": 0.05628393217921257, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7880 + }, + { + "epoch": 6.294728434504792, + "grad_norm": 0.09164825826883316, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7881 + }, + { + "epoch": 6.295527156549521, + "grad_norm": 0.09182474762201309, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7882 + }, + { + "epoch": 6.296325878594249, + "grad_norm": 0.03495810180902481, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7883 + }, + { + "epoch": 6.297124600638978, + "grad_norm": 0.07738466560840607, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7884 + }, + { + "epoch": 6.297923322683706, + "grad_norm": 0.06034242361783981, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7885 + }, + { + "epoch": 6.298722044728435, + "grad_norm": 0.04083844646811485, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7886 + }, + { + "epoch": 6.2995207667731625, + "grad_norm": 0.0918336734175682, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7887 + }, + { + "epoch": 6.300319488817891, + "grad_norm": 0.07351864874362946, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7888 + }, + { + "epoch": 6.30111821086262, + "grad_norm": 0.042986564338207245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7889 + }, + { + "epoch": 6.301916932907348, + "grad_norm": 0.05983031541109085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7890 + }, + { + "epoch": 6.302715654952077, + "grad_norm": 0.10980594903230667, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7891 + }, + { + "epoch": 6.303514376996805, + "grad_norm": 0.04517138749361038, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7892 + }, + { + "epoch": 6.304313099041534, + "grad_norm": 0.08489427715539932, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7893 + }, + { + "epoch": 6.305111821086262, + "grad_norm": 0.040421262383461, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7894 + }, + { + "epoch": 6.305910543130991, + "grad_norm": 0.0438009649515152, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7895 + }, + { + "epoch": 6.306709265175719, + "grad_norm": 0.05797100067138672, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7896 + }, + { + "epoch": 6.307507987220447, + "grad_norm": 0.08798980712890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7897 + }, + { + "epoch": 6.3083067092651754, + "grad_norm": 0.0502130500972271, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7898 + }, + { + "epoch": 6.309105431309904, + "grad_norm": 0.11610639840364456, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7899 + }, + { + "epoch": 6.3099041533546325, + "grad_norm": 0.061168819665908813, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7900 + }, + { + "epoch": 6.310702875399361, + "grad_norm": 0.0469425804913044, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7901 + }, + { + "epoch": 6.31150159744409, + "grad_norm": 0.0483059324324131, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7902 + }, + { + "epoch": 6.312300319488818, + "grad_norm": 0.120233453810215, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7903 + }, + { + "epoch": 6.313099041533547, + "grad_norm": 0.10025710612535477, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7904 + }, + { + "epoch": 6.313897763578275, + "grad_norm": 0.08750995993614197, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7905 + }, + { + "epoch": 6.314696485623003, + "grad_norm": 0.31308433413505554, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7906 + }, + { + "epoch": 6.315495207667731, + "grad_norm": 0.06390809267759323, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7907 + }, + { + "epoch": 6.31629392971246, + "grad_norm": 0.0657041072845459, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7908 + }, + { + "epoch": 6.317092651757188, + "grad_norm": 0.09626918286085129, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7909 + }, + { + "epoch": 6.317891373801917, + "grad_norm": 0.05565343424677849, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7910 + }, + { + "epoch": 6.318690095846645, + "grad_norm": 0.06147831678390503, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7911 + }, + { + "epoch": 6.319488817891374, + "grad_norm": 0.08704033493995667, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7912 + }, + { + "epoch": 6.3202875399361025, + "grad_norm": 0.04405020549893379, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7913 + }, + { + "epoch": 6.321086261980831, + "grad_norm": 0.07587708532810211, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7914 + }, + { + "epoch": 6.321884984025559, + "grad_norm": 0.05935811623930931, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7915 + }, + { + "epoch": 6.322683706070287, + "grad_norm": 0.045584313571453094, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7916 + }, + { + "epoch": 6.323482428115016, + "grad_norm": 0.065196193754673, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7917 + }, + { + "epoch": 6.324281150159744, + "grad_norm": 0.05996553227305412, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7918 + }, + { + "epoch": 6.325079872204473, + "grad_norm": 0.04771357774734497, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7919 + }, + { + "epoch": 6.325878594249201, + "grad_norm": 0.05875687673687935, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7920 + }, + { + "epoch": 6.32667731629393, + "grad_norm": 0.15765227377414703, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7921 + }, + { + "epoch": 6.327476038338658, + "grad_norm": 0.038563717156648636, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7922 + }, + { + "epoch": 6.328274760383387, + "grad_norm": 0.04321083426475525, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7923 + }, + { + "epoch": 6.329073482428115, + "grad_norm": 0.04427725449204445, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7924 + }, + { + "epoch": 6.329872204472843, + "grad_norm": 0.06047825515270233, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7925 + }, + { + "epoch": 6.330670926517572, + "grad_norm": 0.05161035805940628, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7926 + }, + { + "epoch": 6.3314696485623, + "grad_norm": 0.06512151658535004, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7927 + }, + { + "epoch": 6.332268370607029, + "grad_norm": 0.05178358778357506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7928 + }, + { + "epoch": 6.333067092651757, + "grad_norm": 0.06199260801076889, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7929 + }, + { + "epoch": 6.333865814696486, + "grad_norm": 0.09948168694972992, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7930 + }, + { + "epoch": 6.334664536741214, + "grad_norm": 0.06568150222301483, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7931 + }, + { + "epoch": 6.335463258785943, + "grad_norm": 0.036642882972955704, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7932 + }, + { + "epoch": 6.336261980830671, + "grad_norm": 0.04814688116312027, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7933 + }, + { + "epoch": 6.3370607028754, + "grad_norm": 0.03938854858279228, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7934 + }, + { + "epoch": 6.337859424920127, + "grad_norm": 0.07778320461511612, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7935 + }, + { + "epoch": 6.338658146964856, + "grad_norm": 0.16271090507507324, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7936 + }, + { + "epoch": 6.3394568690095845, + "grad_norm": 0.3652990460395813, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7937 + }, + { + "epoch": 6.340255591054313, + "grad_norm": 0.0592365525662899, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7938 + }, + { + "epoch": 6.3410543130990416, + "grad_norm": 0.28622883558273315, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7939 + }, + { + "epoch": 6.34185303514377, + "grad_norm": 0.2270730584859848, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7940 + }, + { + "epoch": 6.342651757188499, + "grad_norm": 0.10781756043434143, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7941 + }, + { + "epoch": 6.343450479233227, + "grad_norm": 0.11611706018447876, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7942 + }, + { + "epoch": 6.344249201277956, + "grad_norm": 0.08212626725435257, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7943 + }, + { + "epoch": 6.345047923322683, + "grad_norm": 0.0739196389913559, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7944 + }, + { + "epoch": 6.345846645367412, + "grad_norm": 0.1029743030667305, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7945 + }, + { + "epoch": 6.34664536741214, + "grad_norm": 0.2787686586380005, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7946 + }, + { + "epoch": 6.347444089456869, + "grad_norm": 0.12180152535438538, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7947 + }, + { + "epoch": 6.348242811501597, + "grad_norm": 0.178681880235672, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7948 + }, + { + "epoch": 6.349041533546326, + "grad_norm": 0.10219722986221313, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7949 + }, + { + "epoch": 6.3498402555910545, + "grad_norm": 0.0773158147931099, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7950 + }, + { + "epoch": 6.350638977635783, + "grad_norm": 0.15096192061901093, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7951 + }, + { + "epoch": 6.3514376996805115, + "grad_norm": 0.06237277388572693, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7952 + }, + { + "epoch": 6.352236421725239, + "grad_norm": 1.4819257259368896, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7953 + }, + { + "epoch": 6.353035143769968, + "grad_norm": 0.09716464579105377, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7954 + }, + { + "epoch": 6.353833865814696, + "grad_norm": 0.10105668753385544, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7955 + }, + { + "epoch": 6.354632587859425, + "grad_norm": 0.09361526370048523, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7956 + }, + { + "epoch": 6.355431309904153, + "grad_norm": 0.04209212213754654, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7957 + }, + { + "epoch": 6.356230031948882, + "grad_norm": 0.11653190106153488, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7958 + }, + { + "epoch": 6.35702875399361, + "grad_norm": 0.1552112102508545, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7959 + }, + { + "epoch": 6.357827476038339, + "grad_norm": 0.07934660464525223, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7960 + }, + { + "epoch": 6.358626198083067, + "grad_norm": 0.10928693413734436, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7961 + }, + { + "epoch": 6.359424920127796, + "grad_norm": 0.15923380851745605, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7962 + }, + { + "epoch": 6.360223642172524, + "grad_norm": 0.12151104211807251, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7963 + }, + { + "epoch": 6.361022364217252, + "grad_norm": 0.055971868336200714, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7964 + }, + { + "epoch": 6.361821086261981, + "grad_norm": 0.17611366510391235, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7965 + }, + { + "epoch": 6.362619808306709, + "grad_norm": 0.16098986566066742, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7966 + }, + { + "epoch": 6.363418530351438, + "grad_norm": 1.6793769598007202, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7967 + }, + { + "epoch": 6.364217252396166, + "grad_norm": 0.4322223365306854, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 7968 + }, + { + "epoch": 6.365015974440895, + "grad_norm": 0.35510173439979553, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 7969 + }, + { + "epoch": 6.365814696485623, + "grad_norm": 0.08799898624420166, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7970 + }, + { + "epoch": 6.366613418530352, + "grad_norm": 0.28774675726890564, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7971 + }, + { + "epoch": 6.36741214057508, + "grad_norm": 0.28109011054039, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7972 + }, + { + "epoch": 6.368210862619808, + "grad_norm": 0.09055986255407333, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7973 + }, + { + "epoch": 6.3690095846645365, + "grad_norm": 0.15083353221416473, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7974 + }, + { + "epoch": 6.369808306709265, + "grad_norm": 0.20686668157577515, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7975 + }, + { + "epoch": 6.3706070287539935, + "grad_norm": 0.047575660049915314, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 7976 + }, + { + "epoch": 6.371405750798722, + "grad_norm": 0.25424477458000183, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7977 + }, + { + "epoch": 6.372204472843451, + "grad_norm": 0.21839222311973572, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7978 + }, + { + "epoch": 6.373003194888179, + "grad_norm": 0.06493431329727173, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7979 + }, + { + "epoch": 6.373801916932908, + "grad_norm": 0.2369518280029297, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 7980 + }, + { + "epoch": 6.374600638977636, + "grad_norm": 0.14641214907169342, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7981 + }, + { + "epoch": 6.375399361022364, + "grad_norm": 0.11602997034788132, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7982 + }, + { + "epoch": 6.376198083067092, + "grad_norm": 0.18792425096035004, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7983 + }, + { + "epoch": 6.376996805111821, + "grad_norm": 0.06824373453855515, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7984 + }, + { + "epoch": 6.377795527156549, + "grad_norm": 0.1228032335639, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7985 + }, + { + "epoch": 6.378594249201278, + "grad_norm": 0.15771286189556122, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7986 + }, + { + "epoch": 6.3793929712460065, + "grad_norm": 0.1157795861363411, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7987 + }, + { + "epoch": 6.380191693290735, + "grad_norm": 0.07282877713441849, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7988 + }, + { + "epoch": 6.3809904153354635, + "grad_norm": 0.10168643295764923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7989 + }, + { + "epoch": 6.381789137380192, + "grad_norm": 0.24466580152511597, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7990 + }, + { + "epoch": 6.38258785942492, + "grad_norm": 0.0972297191619873, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7991 + }, + { + "epoch": 6.383386581469648, + "grad_norm": 0.08349917083978653, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7992 + }, + { + "epoch": 6.384185303514377, + "grad_norm": 0.058114584535360336, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7993 + }, + { + "epoch": 6.384984025559105, + "grad_norm": 0.04745171591639519, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7994 + }, + { + "epoch": 6.385782747603834, + "grad_norm": 0.05484034866094589, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7995 + }, + { + "epoch": 6.386581469648562, + "grad_norm": 0.05094960704445839, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7996 + }, + { + "epoch": 6.387380191693291, + "grad_norm": 0.06368618458509445, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7997 + }, + { + "epoch": 6.388178913738019, + "grad_norm": 0.07042541354894638, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7998 + }, + { + "epoch": 6.388977635782748, + "grad_norm": 0.06182365491986275, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7999 + }, + { + "epoch": 6.389776357827476, + "grad_norm": 0.05778853967785835, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8000 + }, + { + "epoch": 6.390575079872204, + "grad_norm": 0.04334365949034691, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8001 + }, + { + "epoch": 6.391373801916933, + "grad_norm": 0.08214148133993149, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8002 + }, + { + "epoch": 6.392172523961661, + "grad_norm": 0.05468964949250221, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8003 + }, + { + "epoch": 6.39297124600639, + "grad_norm": 0.07484348863363266, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8004 + }, + { + "epoch": 6.393769968051118, + "grad_norm": 0.04987887665629387, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8005 + }, + { + "epoch": 6.394568690095847, + "grad_norm": 0.05584597587585449, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8006 + }, + { + "epoch": 6.395367412140575, + "grad_norm": 0.07088904082775116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8007 + }, + { + "epoch": 6.396166134185304, + "grad_norm": 0.26695576310157776, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8008 + }, + { + "epoch": 6.396964856230032, + "grad_norm": 0.06452658027410507, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8009 + }, + { + "epoch": 6.397763578274761, + "grad_norm": 0.08994145691394806, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8010 + }, + { + "epoch": 6.3985623003194885, + "grad_norm": 0.06565240770578384, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8011 + }, + { + "epoch": 6.399361022364217, + "grad_norm": 0.0492648184299469, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8012 + }, + { + "epoch": 6.4001597444089455, + "grad_norm": 0.06946985423564911, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8013 + }, + { + "epoch": 6.400958466453674, + "grad_norm": 0.08669331669807434, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8014 + }, + { + "epoch": 6.401757188498403, + "grad_norm": 0.07930289953947067, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8015 + }, + { + "epoch": 6.402555910543131, + "grad_norm": 0.15216746926307678, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8016 + }, + { + "epoch": 6.40335463258786, + "grad_norm": 0.051862914115190506, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8017 + }, + { + "epoch": 6.404153354632588, + "grad_norm": 0.044119443744421005, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8018 + }, + { + "epoch": 6.404952076677317, + "grad_norm": 0.09787813574075699, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8019 + }, + { + "epoch": 6.405750798722044, + "grad_norm": 0.05269203707575798, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8020 + }, + { + "epoch": 6.406549520766773, + "grad_norm": 0.06683865934610367, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8021 + }, + { + "epoch": 6.407348242811501, + "grad_norm": 0.04334628954529762, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8022 + }, + { + "epoch": 6.40814696485623, + "grad_norm": 0.037559930235147476, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8023 + }, + { + "epoch": 6.4089456869009584, + "grad_norm": 0.21066749095916748, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8024 + }, + { + "epoch": 6.409744408945687, + "grad_norm": 0.05721563845872879, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8025 + }, + { + "epoch": 6.4105431309904155, + "grad_norm": 0.047683823853731155, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8026 + }, + { + "epoch": 6.411341853035144, + "grad_norm": 0.05377231910824776, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8027 + }, + { + "epoch": 6.412140575079873, + "grad_norm": 0.05604357272386551, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8028 + }, + { + "epoch": 6.4129392971246, + "grad_norm": 0.051680225878953934, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8029 + }, + { + "epoch": 6.413738019169329, + "grad_norm": 0.04465701803565025, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8030 + }, + { + "epoch": 6.414536741214057, + "grad_norm": 0.0454387366771698, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8031 + }, + { + "epoch": 6.415335463258786, + "grad_norm": 0.5079139471054077, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8032 + }, + { + "epoch": 6.416134185303514, + "grad_norm": 0.08386353403329849, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8033 + }, + { + "epoch": 6.416932907348243, + "grad_norm": 0.06023477017879486, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8034 + }, + { + "epoch": 6.417731629392971, + "grad_norm": 0.8634743094444275, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8035 + }, + { + "epoch": 6.4185303514377, + "grad_norm": 0.06926131993532181, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8036 + }, + { + "epoch": 6.419329073482428, + "grad_norm": 0.07563464343547821, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8037 + }, + { + "epoch": 6.420127795527157, + "grad_norm": 0.10181237757205963, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8038 + }, + { + "epoch": 6.420926517571885, + "grad_norm": 0.13995511829853058, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8039 + }, + { + "epoch": 6.421725239616613, + "grad_norm": 0.05968187376856804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8040 + }, + { + "epoch": 6.422523961661342, + "grad_norm": 0.14419680833816528, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8041 + }, + { + "epoch": 6.42332268370607, + "grad_norm": 0.13762469589710236, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8042 + }, + { + "epoch": 6.424121405750799, + "grad_norm": 0.0627644956111908, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8043 + }, + { + "epoch": 6.424920127795527, + "grad_norm": 0.1356768012046814, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8044 + }, + { + "epoch": 6.425718849840256, + "grad_norm": 0.12080833315849304, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8045 + }, + { + "epoch": 6.426517571884984, + "grad_norm": 0.048654112964868546, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8046 + }, + { + "epoch": 6.427316293929713, + "grad_norm": 0.11983022093772888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8047 + }, + { + "epoch": 6.428115015974441, + "grad_norm": 0.09429550170898438, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8048 + }, + { + "epoch": 6.428913738019169, + "grad_norm": 0.07924454659223557, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8049 + }, + { + "epoch": 6.4297124600638975, + "grad_norm": 0.15244926512241364, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8050 + }, + { + "epoch": 6.430511182108626, + "grad_norm": 0.9872325658798218, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8051 + }, + { + "epoch": 6.431309904153355, + "grad_norm": 0.0790395438671112, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8052 + }, + { + "epoch": 6.432108626198083, + "grad_norm": 0.3828068673610687, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8053 + }, + { + "epoch": 6.432907348242812, + "grad_norm": 0.059630244970321655, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8054 + }, + { + "epoch": 6.43370607028754, + "grad_norm": 0.07113327085971832, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8055 + }, + { + "epoch": 6.434504792332269, + "grad_norm": 0.0496523454785347, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8056 + }, + { + "epoch": 6.435303514376997, + "grad_norm": 0.08502436429262161, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8057 + }, + { + "epoch": 6.436102236421725, + "grad_norm": 0.06082376837730408, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8058 + }, + { + "epoch": 6.436900958466453, + "grad_norm": 0.1668524146080017, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8059 + }, + { + "epoch": 6.437699680511182, + "grad_norm": 0.05411513149738312, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8060 + }, + { + "epoch": 6.43849840255591, + "grad_norm": 0.05176519230008125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8061 + }, + { + "epoch": 6.439297124600639, + "grad_norm": 0.0684237852692604, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8062 + }, + { + "epoch": 6.4400958466453675, + "grad_norm": 0.0715038925409317, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8063 + }, + { + "epoch": 6.440894568690096, + "grad_norm": 0.11311113089323044, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8064 + }, + { + "epoch": 6.4416932907348246, + "grad_norm": 0.06320979446172714, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8065 + }, + { + "epoch": 6.442492012779553, + "grad_norm": 0.09221892803907394, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8066 + }, + { + "epoch": 6.443290734824281, + "grad_norm": 0.1183326244354248, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8067 + }, + { + "epoch": 6.444089456869009, + "grad_norm": 0.08447464555501938, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8068 + }, + { + "epoch": 6.444888178913738, + "grad_norm": 0.21791045367717743, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8069 + }, + { + "epoch": 6.445686900958466, + "grad_norm": 0.055015772581100464, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8070 + }, + { + "epoch": 6.446485623003195, + "grad_norm": 0.13536514341831207, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8071 + }, + { + "epoch": 6.447284345047923, + "grad_norm": 0.16620422899723053, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8072 + }, + { + "epoch": 6.448083067092652, + "grad_norm": 0.08793147653341293, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8073 + }, + { + "epoch": 6.44888178913738, + "grad_norm": 0.0962347462773323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8074 + }, + { + "epoch": 6.449680511182109, + "grad_norm": 0.08764681965112686, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 8075 + }, + { + "epoch": 6.4504792332268375, + "grad_norm": 0.06176106259226799, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8076 + }, + { + "epoch": 6.451277955271565, + "grad_norm": 0.06823577731847763, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8077 + }, + { + "epoch": 6.452076677316294, + "grad_norm": 0.11239560693502426, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8078 + }, + { + "epoch": 6.452875399361022, + "grad_norm": 0.10309527069330215, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8079 + }, + { + "epoch": 6.453674121405751, + "grad_norm": 0.07533836364746094, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8080 + }, + { + "epoch": 6.454472843450479, + "grad_norm": 0.06650671362876892, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8081 + }, + { + "epoch": 6.455271565495208, + "grad_norm": 0.1700691431760788, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8082 + }, + { + "epoch": 6.456070287539936, + "grad_norm": 0.06135572865605354, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8083 + }, + { + "epoch": 6.456869009584665, + "grad_norm": 0.08333424478769302, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8084 + }, + { + "epoch": 6.457667731629393, + "grad_norm": 0.1338927149772644, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8085 + }, + { + "epoch": 6.458466453674122, + "grad_norm": 0.07097163796424866, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8086 + }, + { + "epoch": 6.4592651757188495, + "grad_norm": 0.06296008080244064, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8087 + }, + { + "epoch": 6.460063897763578, + "grad_norm": 0.060656916350126266, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8088 + }, + { + "epoch": 6.460862619808307, + "grad_norm": 0.044889576733112335, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8089 + }, + { + "epoch": 6.461661341853035, + "grad_norm": 0.0749807357788086, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8090 + }, + { + "epoch": 6.462460063897764, + "grad_norm": 0.07509054243564606, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8091 + }, + { + "epoch": 6.463258785942492, + "grad_norm": 0.054954417049884796, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8092 + }, + { + "epoch": 6.464057507987221, + "grad_norm": 0.05087047815322876, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8093 + }, + { + "epoch": 6.464856230031949, + "grad_norm": 0.12205887585878372, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8094 + }, + { + "epoch": 6.465654952076678, + "grad_norm": 0.08342424035072327, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8095 + }, + { + "epoch": 6.466453674121405, + "grad_norm": 0.12507228553295135, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8096 + }, + { + "epoch": 6.467252396166134, + "grad_norm": 0.10491037368774414, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8097 + }, + { + "epoch": 6.468051118210862, + "grad_norm": 0.04236119985580444, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8098 + }, + { + "epoch": 6.468849840255591, + "grad_norm": 0.10601458698511124, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8099 + }, + { + "epoch": 6.4696485623003195, + "grad_norm": 0.07485921680927277, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8100 + }, + { + "epoch": 6.470447284345048, + "grad_norm": 0.06351220607757568, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8101 + }, + { + "epoch": 6.4712460063897765, + "grad_norm": 0.08351211249828339, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8102 + }, + { + "epoch": 6.472044728434505, + "grad_norm": 0.07205908000469208, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8103 + }, + { + "epoch": 6.472843450479234, + "grad_norm": 0.07072018831968307, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8104 + }, + { + "epoch": 6.473642172523961, + "grad_norm": 0.0851733461022377, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8105 + }, + { + "epoch": 6.47444089456869, + "grad_norm": 0.07046044617891312, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8106 + }, + { + "epoch": 6.475239616613418, + "grad_norm": 0.03804340958595276, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8107 + }, + { + "epoch": 6.476038338658147, + "grad_norm": 0.059083763509988785, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8108 + }, + { + "epoch": 6.476837060702875, + "grad_norm": 0.0419149249792099, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8109 + }, + { + "epoch": 6.477635782747604, + "grad_norm": 0.07814865559339523, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8110 + }, + { + "epoch": 6.478434504792332, + "grad_norm": 0.12653781473636627, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8111 + }, + { + "epoch": 6.479233226837061, + "grad_norm": 0.10124429315328598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8112 + }, + { + "epoch": 6.4800319488817895, + "grad_norm": 0.05563808232545853, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8113 + }, + { + "epoch": 6.480830670926517, + "grad_norm": 0.07036174833774567, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8114 + }, + { + "epoch": 6.481629392971246, + "grad_norm": 0.0452839694917202, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8115 + }, + { + "epoch": 6.482428115015974, + "grad_norm": 0.13880759477615356, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8116 + }, + { + "epoch": 6.483226837060703, + "grad_norm": 0.03902722895145416, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8117 + }, + { + "epoch": 6.484025559105431, + "grad_norm": 0.08136945217847824, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8118 + }, + { + "epoch": 6.48482428115016, + "grad_norm": 0.09874774515628815, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8119 + }, + { + "epoch": 6.485623003194888, + "grad_norm": 0.06836161017417908, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8120 + }, + { + "epoch": 6.486421725239617, + "grad_norm": 0.1439940482378006, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8121 + }, + { + "epoch": 6.487220447284345, + "grad_norm": 0.0924125388264656, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8122 + }, + { + "epoch": 6.488019169329074, + "grad_norm": 0.06811019778251648, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8123 + }, + { + "epoch": 6.488817891373802, + "grad_norm": 0.1259799599647522, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8124 + }, + { + "epoch": 6.48961661341853, + "grad_norm": 0.1088009849190712, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8125 + }, + { + "epoch": 6.4904153354632586, + "grad_norm": 0.27054721117019653, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8126 + }, + { + "epoch": 6.491214057507987, + "grad_norm": 0.09674181789159775, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8127 + }, + { + "epoch": 6.492012779552716, + "grad_norm": 0.15491390228271484, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8128 + }, + { + "epoch": 6.492811501597444, + "grad_norm": 0.08790267258882523, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8129 + }, + { + "epoch": 6.493610223642173, + "grad_norm": 0.19372408092021942, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8130 + }, + { + "epoch": 6.494408945686901, + "grad_norm": 0.14786171913146973, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8131 + }, + { + "epoch": 6.49520766773163, + "grad_norm": 0.09591338783502579, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8132 + }, + { + "epoch": 6.496006389776358, + "grad_norm": 0.1810663491487503, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8133 + }, + { + "epoch": 6.496805111821086, + "grad_norm": 0.19754691421985626, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8134 + }, + { + "epoch": 6.497603833865814, + "grad_norm": 0.14094877243041992, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8135 + }, + { + "epoch": 6.498402555910543, + "grad_norm": 0.0782506987452507, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8136 + }, + { + "epoch": 6.4992012779552715, + "grad_norm": 0.19543413817882538, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8137 + }, + { + "epoch": 6.5, + "grad_norm": 0.3102439045906067, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8138 + }, + { + "epoch": 6.5007987220447285, + "grad_norm": 0.13952040672302246, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8139 + }, + { + "epoch": 6.501597444089457, + "grad_norm": 0.1902403086423874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8140 + }, + { + "epoch": 6.502396166134186, + "grad_norm": 0.2608654499053955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8141 + }, + { + "epoch": 6.503194888178914, + "grad_norm": 0.22480152547359467, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8142 + }, + { + "epoch": 6.503993610223642, + "grad_norm": 0.21580660343170166, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8143 + }, + { + "epoch": 6.50479233226837, + "grad_norm": 0.1991831213235855, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8144 + }, + { + "epoch": 6.505591054313099, + "grad_norm": 0.25885632634162903, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8145 + }, + { + "epoch": 6.506389776357827, + "grad_norm": 0.2533574104309082, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8146 + }, + { + "epoch": 6.507188498402556, + "grad_norm": 0.11494381725788116, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8147 + }, + { + "epoch": 6.507987220447284, + "grad_norm": 0.1361113339662552, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8148 + }, + { + "epoch": 6.508785942492013, + "grad_norm": 0.22099947929382324, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8149 + }, + { + "epoch": 6.5095846645367414, + "grad_norm": 0.13223077356815338, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8150 + }, + { + "epoch": 6.51038338658147, + "grad_norm": 0.18203037977218628, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8151 + }, + { + "epoch": 6.511182108626198, + "grad_norm": 0.18066702783107758, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8152 + }, + { + "epoch": 6.511980830670926, + "grad_norm": 0.09984144568443298, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8153 + }, + { + "epoch": 6.512779552715655, + "grad_norm": 0.12803718447685242, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8154 + }, + { + "epoch": 6.513578274760383, + "grad_norm": 0.19731956720352173, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8155 + }, + { + "epoch": 6.514376996805112, + "grad_norm": 0.10687378793954849, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8156 + }, + { + "epoch": 6.51517571884984, + "grad_norm": 0.0971442237496376, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8157 + }, + { + "epoch": 6.515974440894569, + "grad_norm": 0.12840867042541504, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8158 + }, + { + "epoch": 6.516773162939297, + "grad_norm": 0.1245417669415474, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8159 + }, + { + "epoch": 6.517571884984026, + "grad_norm": 0.16850991547107697, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8160 + }, + { + "epoch": 6.518370607028754, + "grad_norm": 0.1931404322385788, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8161 + }, + { + "epoch": 6.519169329073483, + "grad_norm": 0.08180713653564453, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8162 + }, + { + "epoch": 6.5199680511182105, + "grad_norm": 0.24530328810214996, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8163 + }, + { + "epoch": 6.520766773162939, + "grad_norm": 0.14107894897460938, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8164 + }, + { + "epoch": 6.521565495207668, + "grad_norm": 0.07984111458063126, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8165 + }, + { + "epoch": 6.522364217252396, + "grad_norm": 0.20894968509674072, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8166 + }, + { + "epoch": 6.523162939297125, + "grad_norm": 0.09663927555084229, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8167 + }, + { + "epoch": 6.523961661341853, + "grad_norm": 0.0913434773683548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8168 + }, + { + "epoch": 6.524760383386582, + "grad_norm": 0.1247463971376419, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8169 + }, + { + "epoch": 6.52555910543131, + "grad_norm": 0.06504802405834198, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8170 + }, + { + "epoch": 6.526357827476039, + "grad_norm": 0.10900555551052094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8171 + }, + { + "epoch": 6.527156549520766, + "grad_norm": 0.047379642724990845, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8172 + }, + { + "epoch": 6.527955271565495, + "grad_norm": 0.17822134494781494, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8173 + }, + { + "epoch": 6.5287539936102235, + "grad_norm": 0.07658754289150238, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8174 + }, + { + "epoch": 6.529552715654952, + "grad_norm": 0.17294292151927948, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8175 + }, + { + "epoch": 6.5303514376996805, + "grad_norm": 0.07095851004123688, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8176 + }, + { + "epoch": 6.531150159744409, + "grad_norm": 0.07328472286462784, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8177 + }, + { + "epoch": 6.531948881789138, + "grad_norm": 0.11216691881418228, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8178 + }, + { + "epoch": 6.532747603833866, + "grad_norm": 0.3007374703884125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8179 + }, + { + "epoch": 6.533546325878595, + "grad_norm": 0.06059226021170616, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8180 + }, + { + "epoch": 6.534345047923322, + "grad_norm": 0.14438967406749725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8181 + }, + { + "epoch": 6.535143769968051, + "grad_norm": 0.1965394914150238, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8182 + }, + { + "epoch": 6.535942492012779, + "grad_norm": 0.130478173494339, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8183 + }, + { + "epoch": 6.536741214057508, + "grad_norm": 0.16713190078735352, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8184 + }, + { + "epoch": 6.537539936102236, + "grad_norm": 0.18644076585769653, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8185 + }, + { + "epoch": 6.538338658146965, + "grad_norm": 0.06685839593410492, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8186 + }, + { + "epoch": 6.539137380191693, + "grad_norm": 0.17819803953170776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8187 + }, + { + "epoch": 6.539936102236422, + "grad_norm": 0.5894746780395508, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8188 + }, + { + "epoch": 6.5407348242811505, + "grad_norm": 0.088719442486763, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8189 + }, + { + "epoch": 6.541533546325878, + "grad_norm": 0.1336045265197754, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8190 + }, + { + "epoch": 6.542332268370607, + "grad_norm": 0.12859520316123962, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8191 + }, + { + "epoch": 6.543130990415335, + "grad_norm": 0.13402487337589264, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8192 + }, + { + "epoch": 6.543929712460064, + "grad_norm": 0.11415290832519531, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8193 + }, + { + "epoch": 6.544728434504792, + "grad_norm": 0.1775715947151184, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8194 + }, + { + "epoch": 6.545527156549521, + "grad_norm": 0.6331294775009155, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8195 + }, + { + "epoch": 6.546325878594249, + "grad_norm": 0.09323445707559586, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8196 + }, + { + "epoch": 6.547124600638978, + "grad_norm": 0.1761421412229538, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8197 + }, + { + "epoch": 6.547923322683706, + "grad_norm": 0.09608824551105499, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8198 + }, + { + "epoch": 6.548722044728435, + "grad_norm": 0.07564207166433334, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8199 + }, + { + "epoch": 6.549520766773163, + "grad_norm": 0.08033318817615509, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8200 + }, + { + "epoch": 6.550319488817891, + "grad_norm": 0.13604776561260223, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8201 + }, + { + "epoch": 6.55111821086262, + "grad_norm": 0.1046299859881401, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8202 + }, + { + "epoch": 6.551916932907348, + "grad_norm": 0.23783712089061737, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8203 + }, + { + "epoch": 6.552715654952077, + "grad_norm": 0.07360750436782837, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8204 + }, + { + "epoch": 6.553514376996805, + "grad_norm": 0.07213526219129562, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8205 + }, + { + "epoch": 6.554313099041534, + "grad_norm": 0.12431066483259201, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8206 + }, + { + "epoch": 6.555111821086262, + "grad_norm": 0.09665104001760483, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8207 + }, + { + "epoch": 6.555910543130991, + "grad_norm": 0.22090987861156464, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8208 + }, + { + "epoch": 6.556709265175719, + "grad_norm": 0.14936690032482147, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8209 + }, + { + "epoch": 6.557507987220447, + "grad_norm": 0.09804648160934448, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8210 + }, + { + "epoch": 6.5583067092651754, + "grad_norm": 0.07829400897026062, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8211 + }, + { + "epoch": 6.559105431309904, + "grad_norm": 0.08218041807413101, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8212 + }, + { + "epoch": 6.5599041533546325, + "grad_norm": 0.08018422871828079, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8213 + }, + { + "epoch": 6.560702875399361, + "grad_norm": 0.07790627330541611, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8214 + }, + { + "epoch": 6.56150159744409, + "grad_norm": 0.12526501715183258, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8215 + }, + { + "epoch": 6.562300319488818, + "grad_norm": 0.15222279727458954, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8216 + }, + { + "epoch": 6.563099041533547, + "grad_norm": 0.19605369865894318, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8217 + }, + { + "epoch": 6.563897763578275, + "grad_norm": 1.4426831007003784, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8218 + }, + { + "epoch": 6.564696485623003, + "grad_norm": 0.184299498796463, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8219 + }, + { + "epoch": 6.565495207667731, + "grad_norm": 0.12029392272233963, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8220 + }, + { + "epoch": 6.56629392971246, + "grad_norm": 0.07442726939916611, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8221 + }, + { + "epoch": 6.567092651757188, + "grad_norm": 0.14331156015396118, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8222 + }, + { + "epoch": 6.567891373801917, + "grad_norm": 0.11202000081539154, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 8223 + }, + { + "epoch": 6.568690095846645, + "grad_norm": 0.10699515789747238, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8224 + }, + { + "epoch": 6.569488817891374, + "grad_norm": 0.07708705961704254, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8225 + }, + { + "epoch": 6.5702875399361025, + "grad_norm": 0.08026644587516785, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8226 + }, + { + "epoch": 6.571086261980831, + "grad_norm": 0.08694002777338028, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8227 + }, + { + "epoch": 6.571884984025559, + "grad_norm": 0.11824248731136322, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8228 + }, + { + "epoch": 6.572683706070287, + "grad_norm": 0.06505008041858673, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8229 + }, + { + "epoch": 6.573482428115016, + "grad_norm": 0.05341152846813202, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8230 + }, + { + "epoch": 6.574281150159744, + "grad_norm": 0.09604120999574661, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8231 + }, + { + "epoch": 6.575079872204473, + "grad_norm": 0.08336330950260162, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8232 + }, + { + "epoch": 6.575878594249201, + "grad_norm": 0.06368359923362732, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8233 + }, + { + "epoch": 6.57667731629393, + "grad_norm": 0.13115698099136353, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8234 + }, + { + "epoch": 6.577476038338658, + "grad_norm": 0.08847527951002121, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8235 + }, + { + "epoch": 6.578274760383387, + "grad_norm": 0.0458359532058239, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8236 + }, + { + "epoch": 6.5790734824281145, + "grad_norm": 0.10106709599494934, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8237 + }, + { + "epoch": 6.579872204472844, + "grad_norm": 0.06641486287117004, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8238 + }, + { + "epoch": 6.580670926517572, + "grad_norm": 0.0733480304479599, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8239 + }, + { + "epoch": 6.5814696485623, + "grad_norm": 0.07835566252470016, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8240 + }, + { + "epoch": 6.582268370607029, + "grad_norm": 0.13473013043403625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8241 + }, + { + "epoch": 6.583067092651757, + "grad_norm": 0.062259674072265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8242 + }, + { + "epoch": 6.583865814696486, + "grad_norm": 0.05236242339015007, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8243 + }, + { + "epoch": 6.584664536741214, + "grad_norm": 0.08255355805158615, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8244 + }, + { + "epoch": 6.585463258785943, + "grad_norm": 0.1182556301355362, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8245 + }, + { + "epoch": 6.586261980830671, + "grad_norm": 0.0555981881916523, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8246 + }, + { + "epoch": 6.5870607028754, + "grad_norm": 0.09490877389907837, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8247 + }, + { + "epoch": 6.587859424920127, + "grad_norm": 0.6106880903244019, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8248 + }, + { + "epoch": 6.588658146964856, + "grad_norm": 0.0474761538207531, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8249 + }, + { + "epoch": 6.5894568690095845, + "grad_norm": 0.1429997831583023, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8250 + }, + { + "epoch": 6.590255591054313, + "grad_norm": 0.0815487951040268, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8251 + }, + { + "epoch": 6.5910543130990416, + "grad_norm": 0.096903957426548, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8252 + }, + { + "epoch": 6.59185303514377, + "grad_norm": 0.17775478959083557, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8253 + }, + { + "epoch": 6.592651757188499, + "grad_norm": 0.11637275665998459, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8254 + }, + { + "epoch": 6.593450479233227, + "grad_norm": 0.08475788682699203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8255 + }, + { + "epoch": 6.594249201277956, + "grad_norm": 0.1786298304796219, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8256 + }, + { + "epoch": 6.595047923322683, + "grad_norm": 0.12316745519638062, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8257 + }, + { + "epoch": 6.595846645367412, + "grad_norm": 0.5367861986160278, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8258 + }, + { + "epoch": 6.59664536741214, + "grad_norm": 0.2289825677871704, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8259 + }, + { + "epoch": 6.597444089456869, + "grad_norm": 0.17333106696605682, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8260 + }, + { + "epoch": 6.598242811501597, + "grad_norm": 0.10858172923326492, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8261 + }, + { + "epoch": 6.599041533546326, + "grad_norm": 0.2013384997844696, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8262 + }, + { + "epoch": 6.5998402555910545, + "grad_norm": 0.13658639788627625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8263 + }, + { + "epoch": 6.600638977635783, + "grad_norm": 0.12755805253982544, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8264 + }, + { + "epoch": 6.6014376996805115, + "grad_norm": 0.18299050629138947, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 6.602236421725239, + "grad_norm": 0.07105828821659088, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8266 + }, + { + "epoch": 6.603035143769968, + "grad_norm": 0.13049830496311188, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8267 + }, + { + "epoch": 6.603833865814696, + "grad_norm": 0.16121532022953033, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8268 + }, + { + "epoch": 6.604632587859425, + "grad_norm": 0.07512015104293823, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8269 + }, + { + "epoch": 6.605431309904153, + "grad_norm": 0.17407254874706268, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8270 + }, + { + "epoch": 6.606230031948882, + "grad_norm": 0.11297854781150818, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8271 + }, + { + "epoch": 6.60702875399361, + "grad_norm": 0.2839175760746002, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8272 + }, + { + "epoch": 6.607827476038339, + "grad_norm": 0.07847599685192108, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8273 + }, + { + "epoch": 6.608626198083067, + "grad_norm": 0.08995212614536285, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8274 + }, + { + "epoch": 6.609424920127795, + "grad_norm": 0.07382770627737045, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8275 + }, + { + "epoch": 6.6102236421725244, + "grad_norm": 0.06170637533068657, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8276 + }, + { + "epoch": 6.611022364217252, + "grad_norm": 0.07311394810676575, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8277 + }, + { + "epoch": 6.611821086261981, + "grad_norm": 0.06827707588672638, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8278 + }, + { + "epoch": 6.612619808306709, + "grad_norm": 0.05261022970080376, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8279 + }, + { + "epoch": 6.613418530351438, + "grad_norm": 0.11326271295547485, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8280 + }, + { + "epoch": 6.614217252396166, + "grad_norm": 0.1652819961309433, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8281 + }, + { + "epoch": 6.615015974440895, + "grad_norm": 0.10749676078557968, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8282 + }, + { + "epoch": 6.615814696485623, + "grad_norm": 0.20359984040260315, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8283 + }, + { + "epoch": 6.616613418530352, + "grad_norm": 0.18771138787269592, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8284 + }, + { + "epoch": 6.61741214057508, + "grad_norm": 2.5382773876190186, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8285 + }, + { + "epoch": 6.618210862619808, + "grad_norm": 0.30566683411598206, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8286 + }, + { + "epoch": 6.6190095846645365, + "grad_norm": 0.3638366758823395, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8287 + }, + { + "epoch": 6.619808306709265, + "grad_norm": 0.10939022153615952, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8288 + }, + { + "epoch": 6.6206070287539935, + "grad_norm": 0.3243744969367981, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8289 + }, + { + "epoch": 6.621405750798722, + "grad_norm": 0.2703976333141327, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8290 + }, + { + "epoch": 6.622204472843451, + "grad_norm": 0.06998306512832642, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8291 + }, + { + "epoch": 6.623003194888179, + "grad_norm": 0.25409170985221863, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8292 + }, + { + "epoch": 6.623801916932908, + "grad_norm": 0.110246442258358, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8293 + }, + { + "epoch": 6.624600638977636, + "grad_norm": 0.1667647659778595, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8294 + }, + { + "epoch": 6.625399361022364, + "grad_norm": 0.17452718317508698, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8295 + }, + { + "epoch": 6.626198083067092, + "grad_norm": 0.11691702157258987, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8296 + }, + { + "epoch": 6.626996805111821, + "grad_norm": 0.14679500460624695, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8297 + }, + { + "epoch": 6.627795527156549, + "grad_norm": 0.06978808343410492, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8298 + }, + { + "epoch": 6.628594249201278, + "grad_norm": 0.36758533120155334, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8299 + }, + { + "epoch": 6.6293929712460065, + "grad_norm": 0.11101481318473816, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8300 + }, + { + "epoch": 6.630191693290735, + "grad_norm": 0.11762239784002304, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8301 + }, + { + "epoch": 6.6309904153354635, + "grad_norm": 0.11467000097036362, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8302 + }, + { + "epoch": 6.631789137380192, + "grad_norm": 0.14236292243003845, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8303 + }, + { + "epoch": 6.63258785942492, + "grad_norm": 0.050860557705163956, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8304 + }, + { + "epoch": 6.633386581469648, + "grad_norm": 0.07763084024190903, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 8305 + }, + { + "epoch": 6.634185303514377, + "grad_norm": 0.06728993356227875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8306 + }, + { + "epoch": 6.634984025559105, + "grad_norm": 0.06984454393386841, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8307 + }, + { + "epoch": 6.635782747603834, + "grad_norm": 0.09839699417352676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8308 + }, + { + "epoch": 6.636581469648562, + "grad_norm": 0.1262810379266739, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8309 + }, + { + "epoch": 6.637380191693291, + "grad_norm": 0.08147390931844711, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8310 + }, + { + "epoch": 6.638178913738019, + "grad_norm": 0.11567803472280502, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8311 + }, + { + "epoch": 6.638977635782748, + "grad_norm": 0.14972445368766785, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8312 + }, + { + "epoch": 6.6397763578274756, + "grad_norm": 0.2970331609249115, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8313 + }, + { + "epoch": 6.640575079872205, + "grad_norm": 0.05576174706220627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8314 + }, + { + "epoch": 6.641373801916933, + "grad_norm": 0.048716023564338684, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8315 + }, + { + "epoch": 6.642172523961661, + "grad_norm": 0.05986058712005615, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8316 + }, + { + "epoch": 6.64297124600639, + "grad_norm": 0.07985493540763855, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8317 + }, + { + "epoch": 6.643769968051118, + "grad_norm": 0.5361261963844299, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 8318 + }, + { + "epoch": 6.644568690095847, + "grad_norm": 0.15383858978748322, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8319 + }, + { + "epoch": 6.645367412140575, + "grad_norm": 0.17428068816661835, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8320 + }, + { + "epoch": 6.646166134185304, + "grad_norm": 0.09801791608333588, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8321 + }, + { + "epoch": 6.646964856230032, + "grad_norm": 0.11805883049964905, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8322 + }, + { + "epoch": 6.647763578274761, + "grad_norm": 0.13135986030101776, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8323 + }, + { + "epoch": 6.6485623003194885, + "grad_norm": 0.10351908206939697, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8324 + }, + { + "epoch": 6.649361022364217, + "grad_norm": 0.11086217314004898, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8325 + }, + { + "epoch": 6.6501597444089455, + "grad_norm": 0.1173853799700737, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8326 + }, + { + "epoch": 6.650958466453674, + "grad_norm": 0.10743618756532669, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8327 + }, + { + "epoch": 6.651757188498403, + "grad_norm": 0.5378667116165161, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8328 + }, + { + "epoch": 6.652555910543131, + "grad_norm": 0.5077546834945679, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8329 + }, + { + "epoch": 6.65335463258786, + "grad_norm": 0.21998530626296997, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8330 + }, + { + "epoch": 6.654153354632588, + "grad_norm": 0.1235295757651329, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8331 + }, + { + "epoch": 6.654952076677317, + "grad_norm": 0.7328196167945862, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 8332 + }, + { + "epoch": 6.655750798722044, + "grad_norm": 0.12249958515167236, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8333 + }, + { + "epoch": 6.656549520766773, + "grad_norm": 0.12837325036525726, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8334 + }, + { + "epoch": 6.657348242811501, + "grad_norm": 0.09456688165664673, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8335 + }, + { + "epoch": 6.65814696485623, + "grad_norm": 0.13044698536396027, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8336 + }, + { + "epoch": 6.6589456869009584, + "grad_norm": 0.13105876743793488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8337 + }, + { + "epoch": 6.659744408945687, + "grad_norm": 0.14498500525951385, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8338 + }, + { + "epoch": 6.6605431309904155, + "grad_norm": 0.08840721845626831, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8339 + }, + { + "epoch": 6.661341853035144, + "grad_norm": 1.276719570159912, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8340 + }, + { + "epoch": 6.662140575079873, + "grad_norm": 0.36189836263656616, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 8341 + }, + { + "epoch": 6.6629392971246, + "grad_norm": 0.6304068565368652, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 8342 + }, + { + "epoch": 6.663738019169329, + "grad_norm": 0.524870753288269, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 8343 + }, + { + "epoch": 6.664536741214057, + "grad_norm": 0.14638005197048187, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8344 + }, + { + "epoch": 6.665335463258786, + "grad_norm": 0.3090416491031647, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 8345 + }, + { + "epoch": 6.666134185303514, + "grad_norm": 0.1549086570739746, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8346 + }, + { + "epoch": 6.666932907348243, + "grad_norm": 0.36996960639953613, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8347 + }, + { + "epoch": 6.667731629392971, + "grad_norm": 0.4879205524921417, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 8348 + }, + { + "epoch": 6.6685303514377, + "grad_norm": 0.6129382848739624, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8349 + }, + { + "epoch": 6.669329073482428, + "grad_norm": 0.37913191318511963, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8350 + }, + { + "epoch": 6.670127795527156, + "grad_norm": 0.1678311973810196, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 8351 + }, + { + "epoch": 6.6709265175718855, + "grad_norm": 0.17131182551383972, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8352 + }, + { + "epoch": 6.671725239616613, + "grad_norm": 0.29875028133392334, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8353 + }, + { + "epoch": 6.672523961661342, + "grad_norm": 0.5288842916488647, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8354 + }, + { + "epoch": 6.67332268370607, + "grad_norm": 0.24637238681316376, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8355 + }, + { + "epoch": 6.674121405750799, + "grad_norm": 0.25089535117149353, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8356 + }, + { + "epoch": 6.674920127795527, + "grad_norm": 0.5517246723175049, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8357 + }, + { + "epoch": 6.675718849840256, + "grad_norm": 0.07291965931653976, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8358 + }, + { + "epoch": 6.676517571884984, + "grad_norm": 0.2561021149158478, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8359 + }, + { + "epoch": 6.677316293929713, + "grad_norm": 0.2184453308582306, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8360 + }, + { + "epoch": 6.678115015974441, + "grad_norm": 0.10715393722057343, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8361 + }, + { + "epoch": 6.678913738019169, + "grad_norm": 0.16824330389499664, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8362 + }, + { + "epoch": 6.6797124600638975, + "grad_norm": 0.22539092600345612, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8363 + }, + { + "epoch": 6.680511182108626, + "grad_norm": 0.11956257373094559, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8364 + }, + { + "epoch": 6.681309904153355, + "grad_norm": 0.2023434042930603, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8365 + }, + { + "epoch": 6.682108626198083, + "grad_norm": 0.26878416538238525, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8366 + }, + { + "epoch": 6.682907348242812, + "grad_norm": 0.11318770796060562, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8367 + }, + { + "epoch": 6.68370607028754, + "grad_norm": 0.29282090067863464, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8368 + }, + { + "epoch": 6.684504792332269, + "grad_norm": 0.23825445771217346, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8369 + }, + { + "epoch": 6.685303514376997, + "grad_norm": 0.27186012268066406, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 8370 + }, + { + "epoch": 6.686102236421725, + "grad_norm": 0.28540825843811035, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8371 + }, + { + "epoch": 6.686900958466453, + "grad_norm": 0.14273707568645477, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8372 + }, + { + "epoch": 6.687699680511182, + "grad_norm": 0.3684747815132141, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8373 + }, + { + "epoch": 6.68849840255591, + "grad_norm": 0.23812046647071838, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8374 + }, + { + "epoch": 6.689297124600639, + "grad_norm": 0.15459395945072174, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8375 + }, + { + "epoch": 6.6900958466453675, + "grad_norm": 0.28762584924697876, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8376 + }, + { + "epoch": 6.690894568690096, + "grad_norm": 0.16686615347862244, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8377 + }, + { + "epoch": 6.6916932907348246, + "grad_norm": 0.16456246376037598, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8378 + }, + { + "epoch": 6.692492012779553, + "grad_norm": 0.2991560399532318, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8379 + }, + { + "epoch": 6.693290734824281, + "grad_norm": 0.14811092615127563, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8380 + }, + { + "epoch": 6.694089456869009, + "grad_norm": 0.14380809664726257, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8381 + }, + { + "epoch": 6.694888178913738, + "grad_norm": 0.0801207646727562, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8382 + }, + { + "epoch": 6.695686900958466, + "grad_norm": 0.08404620736837387, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8383 + }, + { + "epoch": 6.696485623003195, + "grad_norm": 0.1137305274605751, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8384 + }, + { + "epoch": 6.697284345047923, + "grad_norm": 0.08207721263170242, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8385 + }, + { + "epoch": 6.698083067092652, + "grad_norm": 0.09234748780727386, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8386 + }, + { + "epoch": 6.69888178913738, + "grad_norm": 0.29589149355888367, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8387 + }, + { + "epoch": 6.699680511182109, + "grad_norm": 0.2142077386379242, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8388 + }, + { + "epoch": 6.700479233226837, + "grad_norm": 0.10343299061059952, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8389 + }, + { + "epoch": 6.701277955271565, + "grad_norm": 0.12988241016864777, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8390 + }, + { + "epoch": 6.702076677316294, + "grad_norm": 0.20497195422649384, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 8391 + }, + { + "epoch": 6.702875399361022, + "grad_norm": 0.10697030276060104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8392 + }, + { + "epoch": 6.703674121405751, + "grad_norm": 0.1844921112060547, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8393 + }, + { + "epoch": 6.704472843450479, + "grad_norm": 0.13283176720142365, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8394 + }, + { + "epoch": 6.705271565495208, + "grad_norm": 0.14544987678527832, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8395 + }, + { + "epoch": 6.706070287539936, + "grad_norm": 0.10253588855266571, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8396 + }, + { + "epoch": 6.706869009584665, + "grad_norm": 0.11183217167854309, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8397 + }, + { + "epoch": 6.707667731629393, + "grad_norm": 0.12705212831497192, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8398 + }, + { + "epoch": 6.708466453674122, + "grad_norm": 0.08835884928703308, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8399 + }, + { + "epoch": 6.7092651757188495, + "grad_norm": 0.22377537190914154, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8400 + }, + { + "epoch": 6.710063897763578, + "grad_norm": 0.7205986976623535, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8401 + }, + { + "epoch": 6.710862619808307, + "grad_norm": 0.07383892685174942, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8402 + }, + { + "epoch": 6.711661341853035, + "grad_norm": 0.11109078675508499, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8403 + }, + { + "epoch": 6.712460063897764, + "grad_norm": 0.10979527235031128, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8404 + }, + { + "epoch": 6.713258785942492, + "grad_norm": 0.062491416931152344, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8405 + }, + { + "epoch": 6.714057507987221, + "grad_norm": 0.11196211725473404, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8406 + }, + { + "epoch": 6.714856230031949, + "grad_norm": 0.07815852016210556, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8407 + }, + { + "epoch": 6.715654952076678, + "grad_norm": 3.9684712886810303, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8408 + }, + { + "epoch": 6.716453674121405, + "grad_norm": 0.11982189118862152, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8409 + }, + { + "epoch": 6.717252396166134, + "grad_norm": 0.22319400310516357, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8410 + }, + { + "epoch": 6.718051118210862, + "grad_norm": 0.0937948003411293, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8411 + }, + { + "epoch": 6.718849840255591, + "grad_norm": 0.09193865954875946, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8412 + }, + { + "epoch": 6.7196485623003195, + "grad_norm": 0.08838166296482086, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8413 + }, + { + "epoch": 6.720447284345048, + "grad_norm": 0.0960271805524826, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8414 + }, + { + "epoch": 6.7212460063897765, + "grad_norm": 0.07488188147544861, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8415 + }, + { + "epoch": 6.722044728434505, + "grad_norm": 0.08563253283500671, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8416 + }, + { + "epoch": 6.722843450479234, + "grad_norm": 0.16766750812530518, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8417 + }, + { + "epoch": 6.723642172523961, + "grad_norm": 0.12811559438705444, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8418 + }, + { + "epoch": 6.72444089456869, + "grad_norm": 0.12410838901996613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8419 + }, + { + "epoch": 6.725239616613418, + "grad_norm": 0.1354755014181137, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8420 + }, + { + "epoch": 6.726038338658147, + "grad_norm": 0.17771920561790466, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8421 + }, + { + "epoch": 6.726837060702875, + "grad_norm": 0.19576571881771088, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8422 + }, + { + "epoch": 6.727635782747604, + "grad_norm": 0.5415527820587158, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 8423 + }, + { + "epoch": 6.728434504792332, + "grad_norm": 0.6647717952728271, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8424 + }, + { + "epoch": 6.729233226837061, + "grad_norm": 0.16329380869865417, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8425 + }, + { + "epoch": 6.7300319488817895, + "grad_norm": 0.4046335518360138, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8426 + }, + { + "epoch": 6.730830670926517, + "grad_norm": 0.1817079335451126, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8427 + }, + { + "epoch": 6.731629392971246, + "grad_norm": 0.3438379466533661, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 8428 + }, + { + "epoch": 6.732428115015974, + "grad_norm": 0.48276495933532715, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8429 + }, + { + "epoch": 6.733226837060703, + "grad_norm": 0.4002913236618042, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8430 + }, + { + "epoch": 6.734025559105431, + "grad_norm": 0.37833303213119507, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8431 + }, + { + "epoch": 6.73482428115016, + "grad_norm": 0.26374873518943787, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8432 + }, + { + "epoch": 6.735623003194888, + "grad_norm": 0.19766554236412048, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8433 + }, + { + "epoch": 6.736421725239617, + "grad_norm": 0.1996731013059616, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8434 + }, + { + "epoch": 6.737220447284345, + "grad_norm": 0.19733403623104095, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8435 + }, + { + "epoch": 6.738019169329074, + "grad_norm": 0.24423246085643768, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8436 + }, + { + "epoch": 6.738817891373802, + "grad_norm": 0.4329655170440674, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8437 + }, + { + "epoch": 6.73961661341853, + "grad_norm": 0.6964716911315918, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8438 + }, + { + "epoch": 6.7404153354632586, + "grad_norm": 0.12961135804653168, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8439 + }, + { + "epoch": 6.741214057507987, + "grad_norm": 0.2783071994781494, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8440 + }, + { + "epoch": 6.742012779552716, + "grad_norm": 0.3446369767189026, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8441 + }, + { + "epoch": 6.742811501597444, + "grad_norm": 0.22592051327228546, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8442 + }, + { + "epoch": 6.743610223642173, + "grad_norm": 0.06710102409124374, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8443 + }, + { + "epoch": 6.744408945686901, + "grad_norm": 0.2268608957529068, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8444 + }, + { + "epoch": 6.74520766773163, + "grad_norm": 0.08200005441904068, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8445 + }, + { + "epoch": 6.746006389776358, + "grad_norm": 0.2357168197631836, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8446 + }, + { + "epoch": 6.746805111821086, + "grad_norm": 0.20047837495803833, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8447 + }, + { + "epoch": 6.747603833865814, + "grad_norm": 0.2309340387582779, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8448 + }, + { + "epoch": 6.748402555910543, + "grad_norm": 0.11635745316743851, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8449 + }, + { + "epoch": 6.7492012779552715, + "grad_norm": 0.4076550602912903, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8450 + }, + { + "epoch": 6.75, + "grad_norm": 0.3500226140022278, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8451 + }, + { + "epoch": 6.7507987220447285, + "grad_norm": 0.2993873357772827, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8452 + }, + { + "epoch": 6.751597444089457, + "grad_norm": 0.1099642813205719, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8453 + }, + { + "epoch": 6.752396166134186, + "grad_norm": 0.17455045878887177, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8454 + }, + { + "epoch": 6.753194888178914, + "grad_norm": 0.12831585109233856, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8455 + }, + { + "epoch": 6.753993610223642, + "grad_norm": 0.1048964336514473, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8456 + }, + { + "epoch": 6.75479233226837, + "grad_norm": 0.16713464260101318, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8457 + }, + { + "epoch": 6.755591054313099, + "grad_norm": 0.07837880402803421, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8458 + }, + { + "epoch": 6.756389776357827, + "grad_norm": 0.17375724017620087, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8459 + }, + { + "epoch": 6.757188498402556, + "grad_norm": 0.9700595140457153, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8460 + }, + { + "epoch": 6.757987220447284, + "grad_norm": 0.23614056408405304, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8461 + }, + { + "epoch": 6.758785942492013, + "grad_norm": 0.2536165416240692, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 8462 + }, + { + "epoch": 6.7595846645367414, + "grad_norm": 0.26688873767852783, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 8463 + }, + { + "epoch": 6.76038338658147, + "grad_norm": 0.3807159662246704, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 8464 + }, + { + "epoch": 6.761182108626198, + "grad_norm": 0.2132156789302826, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8465 + }, + { + "epoch": 6.761980830670926, + "grad_norm": 0.19821512699127197, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 8466 + }, + { + "epoch": 6.762779552715655, + "grad_norm": 0.23694948852062225, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 8467 + }, + { + "epoch": 6.763578274760383, + "grad_norm": 0.1524704396724701, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 8468 + }, + { + "epoch": 6.764376996805112, + "grad_norm": 0.26719930768013, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 8469 + }, + { + "epoch": 6.76517571884984, + "grad_norm": 0.12077363580465317, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8470 + }, + { + "epoch": 6.765974440894569, + "grad_norm": 0.14398355782032013, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8471 + }, + { + "epoch": 6.766773162939297, + "grad_norm": 0.1972649097442627, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8472 + }, + { + "epoch": 6.767571884984026, + "grad_norm": 0.10172676295042038, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8473 + }, + { + "epoch": 6.768370607028754, + "grad_norm": 0.10743385553359985, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8474 + }, + { + "epoch": 6.769169329073483, + "grad_norm": 0.06148320063948631, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8475 + }, + { + "epoch": 6.7699680511182105, + "grad_norm": 0.08771604299545288, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8476 + }, + { + "epoch": 6.770766773162939, + "grad_norm": 0.13444122672080994, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8477 + }, + { + "epoch": 6.771565495207668, + "grad_norm": 0.4677158296108246, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8478 + }, + { + "epoch": 6.772364217252396, + "grad_norm": 0.08972432464361191, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8479 + }, + { + "epoch": 6.773162939297125, + "grad_norm": 0.10502214729785919, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8480 + }, + { + "epoch": 6.773961661341853, + "grad_norm": 0.14014923572540283, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8481 + }, + { + "epoch": 6.774760383386582, + "grad_norm": 0.3244888484477997, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8482 + }, + { + "epoch": 6.77555910543131, + "grad_norm": 0.20495742559432983, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8483 + }, + { + "epoch": 6.776357827476039, + "grad_norm": 0.15609663724899292, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8484 + }, + { + "epoch": 6.777156549520766, + "grad_norm": 0.13948239386081696, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8485 + }, + { + "epoch": 6.777955271565495, + "grad_norm": 0.28558677434921265, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8486 + }, + { + "epoch": 6.7787539936102235, + "grad_norm": 0.1481117457151413, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8487 + }, + { + "epoch": 6.779552715654952, + "grad_norm": 0.31998512148857117, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8488 + }, + { + "epoch": 6.7803514376996805, + "grad_norm": 0.1945921927690506, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8489 + }, + { + "epoch": 6.781150159744409, + "grad_norm": 18.217361450195312, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8490 + }, + { + "epoch": 6.781948881789138, + "grad_norm": 0.23472756147384644, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 8491 + }, + { + "epoch": 6.782747603833866, + "grad_norm": 0.10026291757822037, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8492 + }, + { + "epoch": 6.783546325878595, + "grad_norm": 0.14418581128120422, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8493 + }, + { + "epoch": 6.784345047923322, + "grad_norm": 0.14439892768859863, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8494 + }, + { + "epoch": 6.785143769968051, + "grad_norm": 0.37140071392059326, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8495 + }, + { + "epoch": 6.785942492012779, + "grad_norm": 0.09995266050100327, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8496 + }, + { + "epoch": 6.786741214057508, + "grad_norm": 0.08430355042219162, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8497 + }, + { + "epoch": 6.787539936102236, + "grad_norm": 0.11121980845928192, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8498 + }, + { + "epoch": 6.788338658146965, + "grad_norm": 0.20520392060279846, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8499 + }, + { + "epoch": 6.789137380191693, + "grad_norm": 0.10163573920726776, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8500 + }, + { + "epoch": 6.789936102236422, + "grad_norm": 0.12025435268878937, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8501 + }, + { + "epoch": 6.7907348242811505, + "grad_norm": 0.12003593891859055, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8502 + }, + { + "epoch": 6.791533546325878, + "grad_norm": 0.11013154685497284, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8503 + }, + { + "epoch": 6.792332268370607, + "grad_norm": 0.10089465230703354, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8504 + }, + { + "epoch": 6.793130990415335, + "grad_norm": 0.06270314007997513, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8505 + }, + { + "epoch": 6.793929712460064, + "grad_norm": 0.08571597188711166, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8506 + }, + { + "epoch": 6.794728434504792, + "grad_norm": 0.5324975848197937, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8507 + }, + { + "epoch": 6.795527156549521, + "grad_norm": 0.24500170350074768, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8508 + }, + { + "epoch": 6.796325878594249, + "grad_norm": 0.10234003514051437, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8509 + }, + { + "epoch": 6.797124600638978, + "grad_norm": 0.09924131631851196, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8510 + }, + { + "epoch": 6.797923322683706, + "grad_norm": 0.1413181573152542, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8511 + }, + { + "epoch": 6.798722044728435, + "grad_norm": 0.12095441669225693, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8512 + }, + { + "epoch": 6.799520766773163, + "grad_norm": 0.08617071062326431, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8513 + }, + { + "epoch": 6.800319488817891, + "grad_norm": 0.17984576523303986, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 8514 + }, + { + "epoch": 6.80111821086262, + "grad_norm": 0.16447608172893524, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8515 + }, + { + "epoch": 6.801916932907348, + "grad_norm": 0.15486668050289154, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8516 + }, + { + "epoch": 6.802715654952077, + "grad_norm": 0.10176295787096024, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8517 + }, + { + "epoch": 6.803514376996805, + "grad_norm": 0.14911721646785736, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8518 + }, + { + "epoch": 6.804313099041534, + "grad_norm": 0.11073625087738037, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8519 + }, + { + "epoch": 6.805111821086262, + "grad_norm": 0.10299605876207352, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8520 + }, + { + "epoch": 6.805910543130991, + "grad_norm": 0.189669668674469, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8521 + }, + { + "epoch": 6.806709265175719, + "grad_norm": 0.12226799875497818, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8522 + }, + { + "epoch": 6.807507987220447, + "grad_norm": 0.17778469622135162, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8523 + }, + { + "epoch": 6.8083067092651754, + "grad_norm": 0.16370487213134766, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8524 + }, + { + "epoch": 6.809105431309904, + "grad_norm": 0.05171172693371773, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8525 + }, + { + "epoch": 6.8099041533546325, + "grad_norm": 0.16393537819385529, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8526 + }, + { + "epoch": 6.810702875399361, + "grad_norm": 0.09398743510246277, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8527 + }, + { + "epoch": 6.81150159744409, + "grad_norm": 0.08430743217468262, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8528 + }, + { + "epoch": 6.812300319488818, + "grad_norm": 0.1131691113114357, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 8529 + }, + { + "epoch": 6.813099041533547, + "grad_norm": 0.0907130092382431, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8530 + }, + { + "epoch": 6.813897763578275, + "grad_norm": 0.1460096687078476, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8531 + }, + { + "epoch": 6.814696485623003, + "grad_norm": 0.07953288406133652, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8532 + }, + { + "epoch": 6.815495207667731, + "grad_norm": 0.061827294528484344, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 8533 + }, + { + "epoch": 6.81629392971246, + "grad_norm": 0.09172365814447403, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8534 + }, + { + "epoch": 6.817092651757188, + "grad_norm": 0.05858466029167175, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8535 + }, + { + "epoch": 6.817891373801917, + "grad_norm": 0.13774308562278748, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8536 + }, + { + "epoch": 6.818690095846645, + "grad_norm": 0.09840130060911179, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8537 + }, + { + "epoch": 6.819488817891374, + "grad_norm": 0.06836584210395813, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8538 + }, + { + "epoch": 6.8202875399361025, + "grad_norm": 0.15930971503257751, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8539 + }, + { + "epoch": 6.821086261980831, + "grad_norm": 0.12306738644838333, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8540 + }, + { + "epoch": 6.821884984025559, + "grad_norm": 0.09868071228265762, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8541 + }, + { + "epoch": 6.822683706070287, + "grad_norm": 0.09411876648664474, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8542 + }, + { + "epoch": 6.823482428115016, + "grad_norm": 0.09062112122774124, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8543 + }, + { + "epoch": 6.824281150159744, + "grad_norm": 0.14964330196380615, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8544 + }, + { + "epoch": 6.825079872204473, + "grad_norm": 0.1444161832332611, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8545 + }, + { + "epoch": 6.825878594249201, + "grad_norm": 0.15247556567192078, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8546 + }, + { + "epoch": 6.82667731629393, + "grad_norm": 0.1556181013584137, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8547 + }, + { + "epoch": 6.827476038338658, + "grad_norm": 0.1781637817621231, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8548 + }, + { + "epoch": 6.828274760383387, + "grad_norm": 0.10066398978233337, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8549 + }, + { + "epoch": 6.8290734824281145, + "grad_norm": 3.0298452377319336, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8550 + }, + { + "epoch": 6.829872204472844, + "grad_norm": 0.2745296061038971, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8551 + }, + { + "epoch": 6.830670926517572, + "grad_norm": 0.4030947983264923, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 8552 + }, + { + "epoch": 6.8314696485623, + "grad_norm": 0.11019638180732727, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8553 + }, + { + "epoch": 6.832268370607029, + "grad_norm": 0.33687886595726013, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8554 + }, + { + "epoch": 6.833067092651757, + "grad_norm": 0.164499431848526, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8555 + }, + { + "epoch": 6.833865814696486, + "grad_norm": 0.31624776124954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8556 + }, + { + "epoch": 6.834664536741214, + "grad_norm": 0.24264110624790192, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8557 + }, + { + "epoch": 6.835463258785943, + "grad_norm": 0.19310493767261505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8558 + }, + { + "epoch": 6.836261980830671, + "grad_norm": 0.2903575003147125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8559 + }, + { + "epoch": 6.8370607028754, + "grad_norm": 0.22584185004234314, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8560 + }, + { + "epoch": 6.837859424920127, + "grad_norm": 0.2400067150592804, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8561 + }, + { + "epoch": 6.838658146964856, + "grad_norm": 0.22543750703334808, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8562 + }, + { + "epoch": 6.8394568690095845, + "grad_norm": 0.2071310430765152, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8563 + }, + { + "epoch": 6.840255591054313, + "grad_norm": 0.07198980450630188, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8564 + }, + { + "epoch": 6.8410543130990416, + "grad_norm": 0.14733794331550598, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8565 + }, + { + "epoch": 6.84185303514377, + "grad_norm": 0.10259919613599777, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8566 + }, + { + "epoch": 6.842651757188499, + "grad_norm": 0.11961761116981506, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8567 + }, + { + "epoch": 6.843450479233227, + "grad_norm": 0.2714863121509552, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8568 + }, + { + "epoch": 6.844249201277956, + "grad_norm": 0.23675218224525452, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8569 + }, + { + "epoch": 6.845047923322683, + "grad_norm": 0.17738480865955353, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8570 + }, + { + "epoch": 6.845846645367412, + "grad_norm": 0.2558303475379944, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8571 + }, + { + "epoch": 6.84664536741214, + "grad_norm": 0.19869430363178253, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8572 + }, + { + "epoch": 6.847444089456869, + "grad_norm": 0.15806829929351807, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8573 + }, + { + "epoch": 6.848242811501597, + "grad_norm": 0.12016306072473526, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8574 + }, + { + "epoch": 6.849041533546326, + "grad_norm": 0.10831576585769653, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8575 + }, + { + "epoch": 6.8498402555910545, + "grad_norm": 0.06762730330228806, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8576 + }, + { + "epoch": 6.850638977635783, + "grad_norm": 0.0824534222483635, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8577 + }, + { + "epoch": 6.8514376996805115, + "grad_norm": 0.20734307169914246, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8578 + }, + { + "epoch": 6.852236421725239, + "grad_norm": 0.22174668312072754, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8579 + }, + { + "epoch": 6.853035143769968, + "grad_norm": 0.05667027458548546, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8580 + }, + { + "epoch": 6.853833865814696, + "grad_norm": 0.2844708561897278, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8581 + }, + { + "epoch": 6.854632587859425, + "grad_norm": 0.21092848479747772, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8582 + }, + { + "epoch": 6.855431309904153, + "grad_norm": 0.08843044936656952, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8583 + }, + { + "epoch": 6.856230031948882, + "grad_norm": 0.08862966299057007, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8584 + }, + { + "epoch": 6.85702875399361, + "grad_norm": 0.13263291120529175, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8585 + }, + { + "epoch": 6.857827476038339, + "grad_norm": 0.1969175636768341, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8586 + }, + { + "epoch": 6.858626198083067, + "grad_norm": 0.1299106925725937, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8587 + }, + { + "epoch": 6.859424920127795, + "grad_norm": 0.058154329657554626, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8588 + }, + { + "epoch": 6.8602236421725244, + "grad_norm": 0.06485166400671005, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8589 + }, + { + "epoch": 6.861022364217252, + "grad_norm": 6.880006313323975, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8590 + }, + { + "epoch": 6.861821086261981, + "grad_norm": 0.09929946064949036, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8591 + }, + { + "epoch": 6.862619808306709, + "grad_norm": 0.11197477579116821, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8592 + }, + { + "epoch": 6.863418530351438, + "grad_norm": 0.06740657985210419, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8593 + }, + { + "epoch": 6.864217252396166, + "grad_norm": 0.19594676792621613, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8594 + }, + { + "epoch": 6.865015974440895, + "grad_norm": 0.16844215989112854, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8595 + }, + { + "epoch": 6.865814696485623, + "grad_norm": 0.08980540931224823, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8596 + }, + { + "epoch": 6.866613418530352, + "grad_norm": 0.1263660043478012, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8597 + }, + { + "epoch": 6.86741214057508, + "grad_norm": 0.2000604271888733, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8598 + }, + { + "epoch": 6.868210862619808, + "grad_norm": 0.08987699449062347, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8599 + }, + { + "epoch": 6.8690095846645365, + "grad_norm": 0.12263453006744385, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8600 + }, + { + "epoch": 6.869808306709265, + "grad_norm": 0.1567721962928772, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8601 + }, + { + "epoch": 6.8706070287539935, + "grad_norm": 0.08756576478481293, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8602 + }, + { + "epoch": 6.871405750798722, + "grad_norm": 0.11816724389791489, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8603 + }, + { + "epoch": 6.872204472843451, + "grad_norm": 0.13798843324184418, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8604 + }, + { + "epoch": 6.873003194888179, + "grad_norm": 0.12364917248487473, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8605 + }, + { + "epoch": 6.873801916932908, + "grad_norm": 0.1200469508767128, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8606 + }, + { + "epoch": 6.874600638977636, + "grad_norm": 0.12144476920366287, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8607 + }, + { + "epoch": 6.875399361022364, + "grad_norm": 0.20083829760551453, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8608 + }, + { + "epoch": 6.876198083067092, + "grad_norm": 0.2817170023918152, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8609 + }, + { + "epoch": 6.876996805111821, + "grad_norm": 0.12137018889188766, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 8610 + }, + { + "epoch": 6.877795527156549, + "grad_norm": 0.09903489053249359, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8611 + }, + { + "epoch": 6.878594249201278, + "grad_norm": 0.17958515882492065, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8612 + }, + { + "epoch": 6.8793929712460065, + "grad_norm": 0.1041099801659584, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8613 + }, + { + "epoch": 6.880191693290735, + "grad_norm": 0.16099892556667328, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8614 + }, + { + "epoch": 6.8809904153354635, + "grad_norm": 0.061900194734334946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8615 + }, + { + "epoch": 6.881789137380192, + "grad_norm": 0.1341199427843094, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8616 + }, + { + "epoch": 6.88258785942492, + "grad_norm": 0.12683184444904327, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8617 + }, + { + "epoch": 6.883386581469648, + "grad_norm": 0.08566799014806747, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8618 + }, + { + "epoch": 6.884185303514377, + "grad_norm": 0.1616903841495514, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8619 + }, + { + "epoch": 6.884984025559105, + "grad_norm": 0.05832672119140625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8620 + }, + { + "epoch": 6.885782747603834, + "grad_norm": 0.15186071395874023, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8621 + }, + { + "epoch": 6.886581469648562, + "grad_norm": 0.16585935652256012, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8622 + }, + { + "epoch": 6.887380191693291, + "grad_norm": 0.1267954260110855, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8623 + }, + { + "epoch": 6.888178913738019, + "grad_norm": 0.22396692633628845, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8624 + }, + { + "epoch": 6.888977635782748, + "grad_norm": 0.133334219455719, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8625 + }, + { + "epoch": 6.8897763578274756, + "grad_norm": 0.1935819834470749, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8626 + }, + { + "epoch": 6.890575079872205, + "grad_norm": 0.32829585671424866, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8627 + }, + { + "epoch": 6.891373801916933, + "grad_norm": 0.231554314494133, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8628 + }, + { + "epoch": 6.892172523961661, + "grad_norm": 0.20693574845790863, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8629 + }, + { + "epoch": 6.89297124600639, + "grad_norm": 0.21037861704826355, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8630 + }, + { + "epoch": 6.893769968051118, + "grad_norm": 0.051133595407009125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8631 + }, + { + "epoch": 6.894568690095847, + "grad_norm": 0.17635062336921692, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8632 + }, + { + "epoch": 6.895367412140575, + "grad_norm": 0.14592808485031128, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8633 + }, + { + "epoch": 6.896166134185304, + "grad_norm": 0.15353697538375854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8634 + }, + { + "epoch": 6.896964856230032, + "grad_norm": 0.19556251168251038, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8635 + }, + { + "epoch": 6.897763578274761, + "grad_norm": 0.06867649406194687, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8636 + }, + { + "epoch": 6.8985623003194885, + "grad_norm": 0.15286169946193695, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8637 + }, + { + "epoch": 6.899361022364217, + "grad_norm": 0.28361746668815613, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8638 + }, + { + "epoch": 6.9001597444089455, + "grad_norm": 0.09351217746734619, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8639 + }, + { + "epoch": 6.900958466453674, + "grad_norm": 0.11050279438495636, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8640 + }, + { + "epoch": 6.901757188498403, + "grad_norm": 0.1648218333721161, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8641 + }, + { + "epoch": 6.902555910543131, + "grad_norm": 0.10323848575353622, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8642 + }, + { + "epoch": 6.90335463258786, + "grad_norm": 0.14925505220890045, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8643 + }, + { + "epoch": 6.904153354632588, + "grad_norm": 0.05877414718270302, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8644 + }, + { + "epoch": 6.904952076677317, + "grad_norm": 0.3324354290962219, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8645 + }, + { + "epoch": 6.905750798722044, + "grad_norm": 0.22756889462471008, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8646 + }, + { + "epoch": 6.906549520766773, + "grad_norm": 0.1040947288274765, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8647 + }, + { + "epoch": 6.907348242811501, + "grad_norm": 0.1310190111398697, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8648 + }, + { + "epoch": 6.90814696485623, + "grad_norm": 0.09484609216451645, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8649 + }, + { + "epoch": 6.9089456869009584, + "grad_norm": 0.13337384164333344, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8650 + }, + { + "epoch": 6.909744408945687, + "grad_norm": 0.31157273054122925, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8651 + }, + { + "epoch": 6.9105431309904155, + "grad_norm": 0.15081669390201569, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8652 + }, + { + "epoch": 6.911341853035144, + "grad_norm": 0.14120221138000488, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8653 + }, + { + "epoch": 6.912140575079873, + "grad_norm": 0.6128084659576416, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8654 + }, + { + "epoch": 6.9129392971246, + "grad_norm": 0.6915252208709717, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 8655 + }, + { + "epoch": 6.913738019169329, + "grad_norm": 0.7245156168937683, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 8656 + }, + { + "epoch": 6.914536741214057, + "grad_norm": 0.8400923013687134, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 8657 + }, + { + "epoch": 6.915335463258786, + "grad_norm": 0.3218044340610504, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 8658 + }, + { + "epoch": 6.916134185303514, + "grad_norm": 0.3119533061981201, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 8659 + }, + { + "epoch": 6.916932907348243, + "grad_norm": 0.2192138433456421, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 8660 + }, + { + "epoch": 6.917731629392971, + "grad_norm": 0.36212611198425293, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 8661 + }, + { + "epoch": 6.9185303514377, + "grad_norm": 0.13674713671207428, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8662 + }, + { + "epoch": 6.919329073482428, + "grad_norm": 0.24960070848464966, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 8663 + }, + { + "epoch": 6.920127795527156, + "grad_norm": 0.16797062754631042, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8664 + }, + { + "epoch": 6.9209265175718855, + "grad_norm": 0.23811157047748566, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 8665 + }, + { + "epoch": 6.921725239616613, + "grad_norm": 0.25372570753097534, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8666 + }, + { + "epoch": 6.922523961661342, + "grad_norm": 0.13954615592956543, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8667 + }, + { + "epoch": 6.92332268370607, + "grad_norm": 0.17769959568977356, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8668 + }, + { + "epoch": 6.924121405750799, + "grad_norm": 0.14327546954154968, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 8669 + }, + { + "epoch": 6.924920127795527, + "grad_norm": 0.07454083859920502, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 8670 + }, + { + "epoch": 6.925718849840256, + "grad_norm": 0.18561266362667084, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8671 + }, + { + "epoch": 6.926517571884984, + "grad_norm": 0.11927005648612976, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 8672 + }, + { + "epoch": 6.927316293929713, + "grad_norm": 0.06790865212678909, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8673 + }, + { + "epoch": 6.928115015974441, + "grad_norm": 0.22627630829811096, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8674 + }, + { + "epoch": 6.928913738019169, + "grad_norm": 0.21341092884540558, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8675 + }, + { + "epoch": 6.9297124600638975, + "grad_norm": 0.19292457401752472, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 8676 + }, + { + "epoch": 6.930511182108626, + "grad_norm": 0.15046356618404388, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8677 + }, + { + "epoch": 6.931309904153355, + "grad_norm": 0.13845203816890717, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8678 + }, + { + "epoch": 6.932108626198083, + "grad_norm": 0.18034739792346954, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8679 + }, + { + "epoch": 6.932907348242812, + "grad_norm": 0.3970269560813904, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8680 + }, + { + "epoch": 6.93370607028754, + "grad_norm": 0.133075550198555, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8681 + }, + { + "epoch": 6.934504792332269, + "grad_norm": 0.13149690628051758, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 8682 + }, + { + "epoch": 6.935303514376997, + "grad_norm": 0.1332010179758072, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8683 + }, + { + "epoch": 6.936102236421725, + "grad_norm": 0.13125883042812347, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 8684 + }, + { + "epoch": 6.936900958466453, + "grad_norm": 0.5500382781028748, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 8685 + }, + { + "epoch": 6.937699680511182, + "grad_norm": 0.09766851365566254, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8686 + }, + { + "epoch": 6.93849840255591, + "grad_norm": 0.10732626169919968, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8687 + }, + { + "epoch": 6.939297124600639, + "grad_norm": 0.10059154033660889, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8688 + }, + { + "epoch": 6.9400958466453675, + "grad_norm": 0.09518695622682571, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8689 + }, + { + "epoch": 6.940894568690096, + "grad_norm": 0.1279720813035965, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8690 + }, + { + "epoch": 6.9416932907348246, + "grad_norm": 0.0997946485877037, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8691 + }, + { + "epoch": 6.942492012779553, + "grad_norm": 0.08584152907133102, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8692 + }, + { + "epoch": 6.943290734824281, + "grad_norm": 0.06987651437520981, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8693 + }, + { + "epoch": 6.944089456869009, + "grad_norm": 0.10446512699127197, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8694 + }, + { + "epoch": 6.944888178913738, + "grad_norm": 0.08535288274288177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8695 + }, + { + "epoch": 6.945686900958466, + "grad_norm": 0.15912187099456787, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8696 + }, + { + "epoch": 6.946485623003195, + "grad_norm": 0.20139484107494354, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8697 + }, + { + "epoch": 6.947284345047923, + "grad_norm": 0.10153409093618393, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8698 + }, + { + "epoch": 6.948083067092652, + "grad_norm": 0.04925902560353279, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8699 + }, + { + "epoch": 6.94888178913738, + "grad_norm": 0.13896742463111877, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8700 + }, + { + "epoch": 6.949680511182109, + "grad_norm": 0.07297761738300323, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8701 + }, + { + "epoch": 6.950479233226837, + "grad_norm": 0.09260845929384232, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8702 + }, + { + "epoch": 6.951277955271565, + "grad_norm": 0.11840535700321198, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8703 + }, + { + "epoch": 6.952076677316294, + "grad_norm": 0.17365501821041107, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8704 + }, + { + "epoch": 6.952875399361022, + "grad_norm": 0.1369183212518692, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8705 + }, + { + "epoch": 6.953674121405751, + "grad_norm": 0.11277196556329727, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8706 + }, + { + "epoch": 6.954472843450479, + "grad_norm": 0.11032512784004211, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8707 + }, + { + "epoch": 6.955271565495208, + "grad_norm": 0.12437347322702408, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8708 + }, + { + "epoch": 6.956070287539936, + "grad_norm": 0.08772306144237518, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8709 + }, + { + "epoch": 6.956869009584665, + "grad_norm": 0.05245213583111763, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8710 + }, + { + "epoch": 6.957667731629393, + "grad_norm": 0.1591174304485321, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8711 + }, + { + "epoch": 6.958466453674122, + "grad_norm": 0.21121510863304138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8712 + }, + { + "epoch": 6.9592651757188495, + "grad_norm": 0.11379709839820862, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8713 + }, + { + "epoch": 6.960063897763578, + "grad_norm": 0.10083793848752975, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8714 + }, + { + "epoch": 6.960862619808307, + "grad_norm": 0.0790674164891243, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8715 + }, + { + "epoch": 6.961661341853035, + "grad_norm": 0.13917089998722076, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8716 + }, + { + "epoch": 6.962460063897764, + "grad_norm": 0.18794408440589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8717 + }, + { + "epoch": 6.963258785942492, + "grad_norm": 0.10725098103284836, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8718 + }, + { + "epoch": 6.964057507987221, + "grad_norm": 0.14577186107635498, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8719 + }, + { + "epoch": 6.964856230031949, + "grad_norm": 0.06711703538894653, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8720 + }, + { + "epoch": 6.965654952076678, + "grad_norm": 0.20572635531425476, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8721 + }, + { + "epoch": 6.966453674121405, + "grad_norm": 0.13693936169147491, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8722 + }, + { + "epoch": 6.967252396166134, + "grad_norm": 0.05642275512218475, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8723 + }, + { + "epoch": 6.968051118210862, + "grad_norm": 0.09080768376588821, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8724 + }, + { + "epoch": 6.968849840255591, + "grad_norm": 0.05295126140117645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8725 + }, + { + "epoch": 6.9696485623003195, + "grad_norm": 0.11833932250738144, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8726 + }, + { + "epoch": 6.970447284345048, + "grad_norm": 0.12110085785388947, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8727 + }, + { + "epoch": 6.9712460063897765, + "grad_norm": 0.10044527053833008, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8728 + }, + { + "epoch": 6.972044728434505, + "grad_norm": 0.13638640940189362, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8729 + }, + { + "epoch": 6.972843450479234, + "grad_norm": 0.18118594586849213, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8730 + }, + { + "epoch": 6.973642172523961, + "grad_norm": 0.1394396871328354, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8731 + }, + { + "epoch": 6.97444089456869, + "grad_norm": 0.14276480674743652, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8732 + }, + { + "epoch": 6.975239616613418, + "grad_norm": 0.2213817834854126, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8733 + }, + { + "epoch": 6.976038338658147, + "grad_norm": 0.11497826874256134, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8734 + }, + { + "epoch": 6.976837060702875, + "grad_norm": 0.11436138302087784, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8735 + }, + { + "epoch": 6.977635782747604, + "grad_norm": 0.08433762192726135, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 8736 + }, + { + "epoch": 6.978434504792332, + "grad_norm": 0.1584242880344391, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8737 + }, + { + "epoch": 6.979233226837061, + "grad_norm": 0.09111067652702332, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8738 + }, + { + "epoch": 6.9800319488817895, + "grad_norm": 0.09075064212083817, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8739 + }, + { + "epoch": 6.980830670926517, + "grad_norm": 0.08456333726644516, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8740 + }, + { + "epoch": 6.981629392971246, + "grad_norm": 0.08090690523386002, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8741 + }, + { + "epoch": 6.982428115015974, + "grad_norm": 0.42019179463386536, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8742 + }, + { + "epoch": 6.983226837060703, + "grad_norm": 0.119536854326725, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8743 + }, + { + "epoch": 6.984025559105431, + "grad_norm": 0.08138761669397354, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8744 + }, + { + "epoch": 6.98482428115016, + "grad_norm": 0.5337278246879578, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8745 + }, + { + "epoch": 6.985623003194888, + "grad_norm": 0.1773308366537094, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8746 + }, + { + "epoch": 6.986421725239617, + "grad_norm": 0.10939478129148483, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8747 + }, + { + "epoch": 6.987220447284345, + "grad_norm": 0.18635793030261993, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8748 + }, + { + "epoch": 6.988019169329074, + "grad_norm": 0.11675454676151276, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8749 + }, + { + "epoch": 6.988817891373802, + "grad_norm": 0.11787068843841553, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8750 + }, + { + "epoch": 6.98961661341853, + "grad_norm": 0.2457057386636734, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8751 + }, + { + "epoch": 6.9904153354632586, + "grad_norm": 0.05914906784892082, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 8752 + }, + { + "epoch": 6.991214057507987, + "grad_norm": 0.1494094878435135, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8753 + }, + { + "epoch": 6.992012779552716, + "grad_norm": 0.14485910534858704, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8754 + }, + { + "epoch": 6.992811501597444, + "grad_norm": 1.2348047494888306, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8755 + }, + { + "epoch": 6.993610223642173, + "grad_norm": 0.1546175330877304, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8756 + }, + { + "epoch": 6.994408945686901, + "grad_norm": 0.13474640250205994, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8757 + }, + { + "epoch": 6.99520766773163, + "grad_norm": 0.5535407662391663, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8758 + }, + { + "epoch": 6.996006389776358, + "grad_norm": 0.10516832023859024, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8759 + }, + { + "epoch": 6.996805111821086, + "grad_norm": 0.07872752100229263, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8760 + }, + { + "epoch": 6.997603833865814, + "grad_norm": 0.08130715042352676, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8761 + }, + { + "epoch": 6.998402555910543, + "grad_norm": 0.09496142715215683, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8762 + }, + { + "epoch": 6.9992012779552715, + "grad_norm": 0.06645053625106812, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8763 + }, + { + "epoch": 7.0, + "grad_norm": 0.07332758605480194, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8764 + }, + { + "epoch": 7.0007987220447285, + "grad_norm": 0.09108536690473557, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8765 + }, + { + "epoch": 7.001597444089457, + "grad_norm": 0.13202883303165436, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8766 + }, + { + "epoch": 7.002396166134186, + "grad_norm": 0.09079252928495407, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 8767 + }, + { + "epoch": 7.003194888178914, + "grad_norm": 0.1004822626709938, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8768 + }, + { + "epoch": 7.003993610223642, + "grad_norm": 0.05096781253814697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8769 + }, + { + "epoch": 7.00479233226837, + "grad_norm": 0.14213396608829498, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8770 + }, + { + "epoch": 7.005591054313099, + "grad_norm": 0.11614344269037247, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8771 + }, + { + "epoch": 7.006389776357827, + "grad_norm": 0.1144147664308548, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8772 + }, + { + "epoch": 7.007188498402556, + "grad_norm": 0.1504330188035965, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8773 + }, + { + "epoch": 7.007987220447284, + "grad_norm": 0.10443079471588135, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8774 + }, + { + "epoch": 7.008785942492013, + "grad_norm": 0.166890949010849, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8775 + }, + { + "epoch": 7.0095846645367414, + "grad_norm": 0.12496565282344818, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8776 + }, + { + "epoch": 7.01038338658147, + "grad_norm": 0.12851381301879883, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8777 + }, + { + "epoch": 7.0111821086261985, + "grad_norm": 0.20198717713356018, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8778 + }, + { + "epoch": 7.011980830670926, + "grad_norm": 0.10324864089488983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8779 + }, + { + "epoch": 7.012779552715655, + "grad_norm": 0.12864094972610474, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8780 + }, + { + "epoch": 7.013578274760383, + "grad_norm": 0.11301549524068832, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8781 + }, + { + "epoch": 7.014376996805112, + "grad_norm": 0.13162367045879364, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8782 + }, + { + "epoch": 7.01517571884984, + "grad_norm": 0.1574760377407074, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8783 + }, + { + "epoch": 7.015974440894569, + "grad_norm": 0.07471634447574615, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8784 + }, + { + "epoch": 7.016773162939297, + "grad_norm": 0.09653516113758087, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8785 + }, + { + "epoch": 7.017571884984026, + "grad_norm": 0.13719993829727173, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8786 + }, + { + "epoch": 7.018370607028754, + "grad_norm": 0.10545443743467331, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8787 + }, + { + "epoch": 7.019169329073482, + "grad_norm": 0.1147511675953865, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8788 + }, + { + "epoch": 7.0199680511182105, + "grad_norm": 0.14005234837532043, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8789 + }, + { + "epoch": 7.020766773162939, + "grad_norm": 0.36956554651260376, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8790 + }, + { + "epoch": 7.021565495207668, + "grad_norm": 0.1384177953004837, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8791 + }, + { + "epoch": 7.022364217252396, + "grad_norm": 0.062106356024742126, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8792 + }, + { + "epoch": 7.023162939297125, + "grad_norm": 0.14074385166168213, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8793 + }, + { + "epoch": 7.023961661341853, + "grad_norm": 0.18152809143066406, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8794 + }, + { + "epoch": 7.024760383386582, + "grad_norm": 0.11607832461595535, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8795 + }, + { + "epoch": 7.02555910543131, + "grad_norm": 0.06603241711854935, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8796 + }, + { + "epoch": 7.026357827476039, + "grad_norm": 0.08846289664506912, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8797 + }, + { + "epoch": 7.027156549520766, + "grad_norm": 0.09882134944200516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8798 + }, + { + "epoch": 7.027955271565495, + "grad_norm": 0.11535032093524933, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8799 + }, + { + "epoch": 7.0287539936102235, + "grad_norm": 0.10153281688690186, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8800 + }, + { + "epoch": 7.029552715654952, + "grad_norm": 0.11195418983697891, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8801 + }, + { + "epoch": 7.0303514376996805, + "grad_norm": 0.5721603035926819, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8802 + }, + { + "epoch": 7.031150159744409, + "grad_norm": 0.18006286025047302, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8803 + }, + { + "epoch": 7.031948881789138, + "grad_norm": 0.16561086475849152, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8804 + }, + { + "epoch": 7.032747603833866, + "grad_norm": 0.11010444164276123, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8805 + }, + { + "epoch": 7.033546325878595, + "grad_norm": 0.17741475999355316, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8806 + }, + { + "epoch": 7.034345047923322, + "grad_norm": 0.09941161423921585, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 8807 + }, + { + "epoch": 7.035143769968051, + "grad_norm": 0.20474617183208466, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8808 + }, + { + "epoch": 7.035942492012779, + "grad_norm": 0.07972154021263123, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8809 + }, + { + "epoch": 7.036741214057508, + "grad_norm": 0.17856109142303467, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8810 + }, + { + "epoch": 7.037539936102236, + "grad_norm": 0.1276514083147049, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8811 + }, + { + "epoch": 7.038338658146965, + "grad_norm": 0.08009849488735199, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 8812 + }, + { + "epoch": 7.039137380191693, + "grad_norm": 0.09832913428544998, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8813 + }, + { + "epoch": 7.039936102236422, + "grad_norm": 0.06454402953386307, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8814 + }, + { + "epoch": 7.0407348242811505, + "grad_norm": 0.20843401551246643, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8815 + }, + { + "epoch": 7.041533546325879, + "grad_norm": 0.14909301698207855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8816 + }, + { + "epoch": 7.042332268370607, + "grad_norm": 0.08815812319517136, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8817 + }, + { + "epoch": 7.043130990415335, + "grad_norm": 0.18957766890525818, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8818 + }, + { + "epoch": 7.043929712460064, + "grad_norm": 0.33018213510513306, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8819 + }, + { + "epoch": 7.044728434504792, + "grad_norm": 0.11069374531507492, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8820 + }, + { + "epoch": 7.045527156549521, + "grad_norm": 0.3001084625720978, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8821 + }, + { + "epoch": 7.046325878594249, + "grad_norm": 0.0704922303557396, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8822 + }, + { + "epoch": 7.047124600638978, + "grad_norm": 0.08537211269140244, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8823 + }, + { + "epoch": 7.047923322683706, + "grad_norm": 0.08765899389982224, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8824 + }, + { + "epoch": 7.048722044728435, + "grad_norm": 0.14218255877494812, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8825 + }, + { + "epoch": 7.0495207667731625, + "grad_norm": 0.08026671409606934, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8826 + }, + { + "epoch": 7.050319488817891, + "grad_norm": 0.07170549035072327, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8827 + }, + { + "epoch": 7.05111821086262, + "grad_norm": 1.2578401565551758, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8828 + }, + { + "epoch": 7.051916932907348, + "grad_norm": 0.20149891078472137, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8829 + }, + { + "epoch": 7.052715654952077, + "grad_norm": 0.18734677135944366, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8830 + }, + { + "epoch": 7.053514376996805, + "grad_norm": 0.08732877671718597, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8831 + }, + { + "epoch": 7.054313099041534, + "grad_norm": 0.1895754486322403, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8832 + }, + { + "epoch": 7.055111821086262, + "grad_norm": 0.06839644908905029, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8833 + }, + { + "epoch": 7.055910543130991, + "grad_norm": 4.666222095489502, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8834 + }, + { + "epoch": 7.056709265175719, + "grad_norm": 0.2801821231842041, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8835 + }, + { + "epoch": 7.057507987220447, + "grad_norm": 0.3428499102592468, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8836 + }, + { + "epoch": 7.0583067092651754, + "grad_norm": 0.16896478831768036, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8837 + }, + { + "epoch": 7.059105431309904, + "grad_norm": 1.21062171459198, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8838 + }, + { + "epoch": 7.0599041533546325, + "grad_norm": 0.20507270097732544, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8839 + }, + { + "epoch": 7.060702875399361, + "grad_norm": 0.34736308455467224, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8840 + }, + { + "epoch": 7.06150159744409, + "grad_norm": 0.13628798723220825, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8841 + }, + { + "epoch": 7.062300319488818, + "grad_norm": 0.3212411403656006, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8842 + }, + { + "epoch": 7.063099041533547, + "grad_norm": 0.23049144446849823, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8843 + }, + { + "epoch": 7.063897763578275, + "grad_norm": 0.2785285413265228, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8844 + }, + { + "epoch": 7.064696485623003, + "grad_norm": 0.32158368825912476, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8845 + }, + { + "epoch": 7.065495207667731, + "grad_norm": 0.40443500876426697, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8846 + }, + { + "epoch": 7.06629392971246, + "grad_norm": 0.20072752237319946, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8847 + }, + { + "epoch": 7.067092651757188, + "grad_norm": 0.38166266679763794, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8848 + }, + { + "epoch": 7.067891373801917, + "grad_norm": 0.2771472930908203, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8849 + }, + { + "epoch": 7.068690095846645, + "grad_norm": 0.10485964268445969, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8850 + }, + { + "epoch": 7.069488817891374, + "grad_norm": 0.17424215376377106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8851 + }, + { + "epoch": 7.0702875399361025, + "grad_norm": 0.0972314327955246, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8852 + }, + { + "epoch": 7.071086261980831, + "grad_norm": 0.18021832406520844, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8853 + }, + { + "epoch": 7.0718849840255595, + "grad_norm": 0.08820143342018127, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8854 + }, + { + "epoch": 7.072683706070287, + "grad_norm": 0.1785898506641388, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8855 + }, + { + "epoch": 7.073482428115016, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8856 + }, + { + "epoch": 7.074281150159744, + "grad_norm": 0.1787438541650772, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8857 + }, + { + "epoch": 7.075079872204473, + "grad_norm": 0.16761353611946106, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8858 + }, + { + "epoch": 7.075878594249201, + "grad_norm": 0.5075165629386902, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8859 + }, + { + "epoch": 7.07667731629393, + "grad_norm": 0.13462364673614502, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8860 + }, + { + "epoch": 7.077476038338658, + "grad_norm": 0.20478707551956177, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8861 + }, + { + "epoch": 7.078274760383387, + "grad_norm": 0.14689947664737701, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8862 + }, + { + "epoch": 7.079073482428115, + "grad_norm": 0.36265847086906433, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8863 + }, + { + "epoch": 7.079872204472843, + "grad_norm": 0.18443043529987335, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8864 + }, + { + "epoch": 7.080670926517572, + "grad_norm": 0.04789111018180847, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8865 + }, + { + "epoch": 7.0814696485623, + "grad_norm": 0.18024222552776337, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8866 + }, + { + "epoch": 7.082268370607029, + "grad_norm": 0.08901690691709518, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8867 + }, + { + "epoch": 7.083067092651757, + "grad_norm": 0.20689153671264648, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8868 + }, + { + "epoch": 7.083865814696486, + "grad_norm": 0.15572768449783325, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8869 + }, + { + "epoch": 7.084664536741214, + "grad_norm": 0.2915050685405731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8870 + }, + { + "epoch": 7.085463258785943, + "grad_norm": 0.12404290586709976, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8871 + }, + { + "epoch": 7.086261980830671, + "grad_norm": 0.19628335535526276, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8872 + }, + { + "epoch": 7.0870607028754, + "grad_norm": 0.6693617105484009, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8873 + }, + { + "epoch": 7.087859424920127, + "grad_norm": 0.21526481211185455, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8874 + }, + { + "epoch": 7.088658146964856, + "grad_norm": 0.2779954969882965, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8875 + }, + { + "epoch": 7.0894568690095845, + "grad_norm": 0.14111320674419403, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8876 + }, + { + "epoch": 7.090255591054313, + "grad_norm": 0.26465079188346863, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8877 + }, + { + "epoch": 7.0910543130990416, + "grad_norm": 0.12354349344968796, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8878 + }, + { + "epoch": 7.09185303514377, + "grad_norm": 0.18360896408557892, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8879 + }, + { + "epoch": 7.092651757188499, + "grad_norm": 0.26844218373298645, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8880 + }, + { + "epoch": 7.093450479233227, + "grad_norm": 0.34032055735588074, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8881 + }, + { + "epoch": 7.094249201277956, + "grad_norm": 0.2372630089521408, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8882 + }, + { + "epoch": 7.095047923322683, + "grad_norm": 0.4134571850299835, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8883 + }, + { + "epoch": 7.095846645367412, + "grad_norm": 0.21220949292182922, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8884 + }, + { + "epoch": 7.09664536741214, + "grad_norm": 0.20073527097702026, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8885 + }, + { + "epoch": 7.097444089456869, + "grad_norm": 0.1583309918642044, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8886 + }, + { + "epoch": 7.098242811501597, + "grad_norm": 0.4032151401042938, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8887 + }, + { + "epoch": 7.099041533546326, + "grad_norm": 0.09527560323476791, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 8888 + }, + { + "epoch": 7.0998402555910545, + "grad_norm": 0.2630043625831604, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8889 + }, + { + "epoch": 7.100638977635783, + "grad_norm": 0.06699138134717941, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8890 + }, + { + "epoch": 7.1014376996805115, + "grad_norm": 0.34307003021240234, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8891 + }, + { + "epoch": 7.102236421725239, + "grad_norm": 0.24538451433181763, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8892 + }, + { + "epoch": 7.103035143769968, + "grad_norm": 0.2794513702392578, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8893 + }, + { + "epoch": 7.103833865814696, + "grad_norm": 0.20586012303829193, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8894 + }, + { + "epoch": 7.104632587859425, + "grad_norm": 0.22349807620048523, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8895 + }, + { + "epoch": 7.105431309904153, + "grad_norm": 0.31171584129333496, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8896 + }, + { + "epoch": 7.106230031948882, + "grad_norm": 0.07461030781269073, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8897 + }, + { + "epoch": 7.10702875399361, + "grad_norm": 0.24280597269535065, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8898 + }, + { + "epoch": 7.107827476038339, + "grad_norm": 0.13005708158016205, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8899 + }, + { + "epoch": 7.108626198083067, + "grad_norm": 0.24730080366134644, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8900 + }, + { + "epoch": 7.109424920127796, + "grad_norm": 1.287341833114624, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8901 + }, + { + "epoch": 7.110223642172524, + "grad_norm": 0.15945735573768616, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8902 + }, + { + "epoch": 7.111022364217252, + "grad_norm": 0.09943541884422302, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8903 + }, + { + "epoch": 7.111821086261981, + "grad_norm": 0.12183468043804169, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8904 + }, + { + "epoch": 7.112619808306709, + "grad_norm": 0.11859191954135895, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8905 + }, + { + "epoch": 7.113418530351438, + "grad_norm": 0.27701425552368164, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8906 + }, + { + "epoch": 7.114217252396166, + "grad_norm": 0.14724725484848022, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8907 + }, + { + "epoch": 7.115015974440895, + "grad_norm": 0.1342400461435318, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8908 + }, + { + "epoch": 7.115814696485623, + "grad_norm": 0.15474970638751984, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8909 + }, + { + "epoch": 7.116613418530352, + "grad_norm": 0.1276721954345703, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8910 + }, + { + "epoch": 7.11741214057508, + "grad_norm": 0.14511124789714813, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8911 + }, + { + "epoch": 7.118210862619808, + "grad_norm": 0.10112027823925018, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8912 + }, + { + "epoch": 7.1190095846645365, + "grad_norm": 0.17296795547008514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8913 + }, + { + "epoch": 7.119808306709265, + "grad_norm": 0.09542828798294067, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8914 + }, + { + "epoch": 7.1206070287539935, + "grad_norm": 0.17453183233737946, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8915 + }, + { + "epoch": 7.121405750798722, + "grad_norm": 0.13417603075504303, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8916 + }, + { + "epoch": 7.122204472843451, + "grad_norm": 0.26239508390426636, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8917 + }, + { + "epoch": 7.123003194888179, + "grad_norm": 0.13963834941387177, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8918 + }, + { + "epoch": 7.123801916932908, + "grad_norm": 0.18642054498195648, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8919 + }, + { + "epoch": 7.124600638977636, + "grad_norm": 0.17754590511322021, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8920 + }, + { + "epoch": 7.125399361022364, + "grad_norm": 0.1010628268122673, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8921 + }, + { + "epoch": 7.126198083067092, + "grad_norm": 0.1621905416250229, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8922 + }, + { + "epoch": 7.126996805111821, + "grad_norm": 0.3069966733455658, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8923 + }, + { + "epoch": 7.127795527156549, + "grad_norm": 0.2312333881855011, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8924 + }, + { + "epoch": 7.128594249201278, + "grad_norm": 0.20297785103321075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8925 + }, + { + "epoch": 7.1293929712460065, + "grad_norm": 0.18856601417064667, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8926 + }, + { + "epoch": 7.130191693290735, + "grad_norm": 0.19353985786437988, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8927 + }, + { + "epoch": 7.1309904153354635, + "grad_norm": 0.08276687562465668, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8928 + }, + { + "epoch": 7.131789137380192, + "grad_norm": 0.31372779607772827, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8929 + }, + { + "epoch": 7.13258785942492, + "grad_norm": 0.10208959877490997, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8930 + }, + { + "epoch": 7.133386581469648, + "grad_norm": 0.1636659801006317, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8931 + }, + { + "epoch": 7.134185303514377, + "grad_norm": 0.14321425557136536, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8932 + }, + { + "epoch": 7.134984025559105, + "grad_norm": 0.08438511192798615, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8933 + }, + { + "epoch": 7.135782747603834, + "grad_norm": 0.17451012134552002, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8934 + }, + { + "epoch": 7.136581469648562, + "grad_norm": 0.06913795322179794, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8935 + }, + { + "epoch": 7.137380191693291, + "grad_norm": 0.14176666736602783, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8936 + }, + { + "epoch": 7.138178913738019, + "grad_norm": 0.15005643665790558, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8937 + }, + { + "epoch": 7.138977635782748, + "grad_norm": 0.08884457498788834, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8938 + }, + { + "epoch": 7.139776357827476, + "grad_norm": 0.19651612639427185, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8939 + }, + { + "epoch": 7.140575079872204, + "grad_norm": 0.12419132143259048, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8940 + }, + { + "epoch": 7.141373801916933, + "grad_norm": 0.08800125867128372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8941 + }, + { + "epoch": 7.142172523961661, + "grad_norm": 0.12308578193187714, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8942 + }, + { + "epoch": 7.14297124600639, + "grad_norm": 0.06376682221889496, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8943 + }, + { + "epoch": 7.143769968051118, + "grad_norm": 0.08467467129230499, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8944 + }, + { + "epoch": 7.144568690095847, + "grad_norm": 0.05492696538567543, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8945 + }, + { + "epoch": 7.145367412140575, + "grad_norm": 0.12659363448619843, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8946 + }, + { + "epoch": 7.146166134185304, + "grad_norm": 0.11025204509496689, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8947 + }, + { + "epoch": 7.146964856230032, + "grad_norm": 0.03672007843852043, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8948 + }, + { + "epoch": 7.147763578274761, + "grad_norm": 0.06386546790599823, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8949 + }, + { + "epoch": 7.1485623003194885, + "grad_norm": 0.05484751984477043, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8950 + }, + { + "epoch": 7.149361022364217, + "grad_norm": 0.08663280308246613, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8951 + }, + { + "epoch": 7.1501597444089455, + "grad_norm": 0.10515031963586807, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8952 + }, + { + "epoch": 7.150958466453674, + "grad_norm": 0.05844622105360031, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8953 + }, + { + "epoch": 7.151757188498403, + "grad_norm": 0.061575960367918015, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8954 + }, + { + "epoch": 7.152555910543131, + "grad_norm": 0.30169913172721863, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8955 + }, + { + "epoch": 7.15335463258786, + "grad_norm": 0.15433792769908905, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8956 + }, + { + "epoch": 7.154153354632588, + "grad_norm": 0.11872339993715286, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8957 + }, + { + "epoch": 7.154952076677317, + "grad_norm": 0.4086587131023407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8958 + }, + { + "epoch": 7.155750798722044, + "grad_norm": 0.0976172536611557, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8959 + }, + { + "epoch": 7.156549520766773, + "grad_norm": 0.11132699996232986, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8960 + }, + { + "epoch": 7.157348242811501, + "grad_norm": 0.11129645258188248, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8961 + }, + { + "epoch": 7.15814696485623, + "grad_norm": 0.09004200249910355, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8962 + }, + { + "epoch": 7.1589456869009584, + "grad_norm": 0.1225908026099205, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8963 + }, + { + "epoch": 7.159744408945687, + "grad_norm": 0.10531286895275116, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8964 + }, + { + "epoch": 7.1605431309904155, + "grad_norm": 0.1054515391588211, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8965 + }, + { + "epoch": 7.161341853035144, + "grad_norm": 0.11718834936618805, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8966 + }, + { + "epoch": 7.162140575079873, + "grad_norm": 0.11314168572425842, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8967 + }, + { + "epoch": 7.1629392971246, + "grad_norm": 0.1017487570643425, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8968 + }, + { + "epoch": 7.163738019169329, + "grad_norm": 0.05381032079458237, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8969 + }, + { + "epoch": 7.164536741214057, + "grad_norm": 0.1527879238128662, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8970 + }, + { + "epoch": 7.165335463258786, + "grad_norm": 0.05352415144443512, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8971 + }, + { + "epoch": 7.166134185303514, + "grad_norm": 0.17179784178733826, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8972 + }, + { + "epoch": 7.166932907348243, + "grad_norm": 0.24629469215869904, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8973 + }, + { + "epoch": 7.167731629392971, + "grad_norm": 0.11276146024465561, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8974 + }, + { + "epoch": 7.1685303514377, + "grad_norm": 0.0927032083272934, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8975 + }, + { + "epoch": 7.169329073482428, + "grad_norm": 0.0978626236319542, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8976 + }, + { + "epoch": 7.170127795527157, + "grad_norm": 0.12577946484088898, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8977 + }, + { + "epoch": 7.170926517571885, + "grad_norm": 0.1014678105711937, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8978 + }, + { + "epoch": 7.171725239616613, + "grad_norm": 0.08706190437078476, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8979 + }, + { + "epoch": 7.172523961661342, + "grad_norm": 0.06214338168501854, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8980 + }, + { + "epoch": 7.17332268370607, + "grad_norm": 0.08223161101341248, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8981 + }, + { + "epoch": 7.174121405750799, + "grad_norm": 0.3143157362937927, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8982 + }, + { + "epoch": 7.174920127795527, + "grad_norm": 0.16466212272644043, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8983 + }, + { + "epoch": 7.175718849840256, + "grad_norm": 0.13650043308734894, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8984 + }, + { + "epoch": 7.176517571884984, + "grad_norm": 0.05605694651603699, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8985 + }, + { + "epoch": 7.177316293929713, + "grad_norm": 0.12153269350528717, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8986 + }, + { + "epoch": 7.178115015974441, + "grad_norm": 0.07390844076871872, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8987 + }, + { + "epoch": 7.178913738019169, + "grad_norm": 0.05618416517972946, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8988 + }, + { + "epoch": 7.1797124600638975, + "grad_norm": 0.24178527295589447, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8989 + }, + { + "epoch": 7.180511182108626, + "grad_norm": 0.06414328515529633, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8990 + }, + { + "epoch": 7.181309904153355, + "grad_norm": 0.05483662337064743, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8991 + }, + { + "epoch": 7.182108626198083, + "grad_norm": 0.05821032077074051, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8992 + }, + { + "epoch": 7.182907348242812, + "grad_norm": 0.04972073435783386, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8993 + }, + { + "epoch": 7.18370607028754, + "grad_norm": 0.13323748111724854, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8994 + }, + { + "epoch": 7.184504792332269, + "grad_norm": 0.1341763287782669, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8995 + }, + { + "epoch": 7.185303514376997, + "grad_norm": 0.1092606782913208, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8996 + }, + { + "epoch": 7.186102236421725, + "grad_norm": 0.10611139982938766, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8997 + }, + { + "epoch": 7.186900958466453, + "grad_norm": 0.0810476616024971, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8998 + }, + { + "epoch": 7.187699680511182, + "grad_norm": 0.053938958793878555, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8999 + }, + { + "epoch": 7.18849840255591, + "grad_norm": 0.08355431258678436, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9000 + }, + { + "epoch": 7.189297124600639, + "grad_norm": 0.0719372034072876, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9001 + }, + { + "epoch": 7.1900958466453675, + "grad_norm": 0.0541183203458786, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9002 + }, + { + "epoch": 7.190894568690096, + "grad_norm": 0.08637872338294983, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9003 + }, + { + "epoch": 7.1916932907348246, + "grad_norm": 0.0900801345705986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9004 + }, + { + "epoch": 7.192492012779553, + "grad_norm": 0.08778835088014603, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9005 + }, + { + "epoch": 7.193290734824281, + "grad_norm": 0.13946911692619324, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9006 + }, + { + "epoch": 7.194089456869009, + "grad_norm": 0.20089952647686005, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9007 + }, + { + "epoch": 7.194888178913738, + "grad_norm": 0.20472672581672668, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 9008 + }, + { + "epoch": 7.195686900958466, + "grad_norm": 0.09503829479217529, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9009 + }, + { + "epoch": 7.196485623003195, + "grad_norm": 0.057289477437734604, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9010 + }, + { + "epoch": 7.197284345047923, + "grad_norm": 0.18998531997203827, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9011 + }, + { + "epoch": 7.198083067092652, + "grad_norm": 0.12228010594844818, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9012 + }, + { + "epoch": 7.19888178913738, + "grad_norm": 0.0855637639760971, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9013 + }, + { + "epoch": 7.199680511182109, + "grad_norm": 0.08341407775878906, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9014 + }, + { + "epoch": 7.2004792332268375, + "grad_norm": 0.06806697696447372, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9015 + }, + { + "epoch": 7.201277955271565, + "grad_norm": 0.06730692833662033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9016 + }, + { + "epoch": 7.202076677316294, + "grad_norm": 0.04983438923954964, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9017 + }, + { + "epoch": 7.202875399361022, + "grad_norm": 0.09153386205434799, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9018 + }, + { + "epoch": 7.203674121405751, + "grad_norm": 0.06117153540253639, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9019 + }, + { + "epoch": 7.204472843450479, + "grad_norm": 0.056790344417095184, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9020 + }, + { + "epoch": 7.205271565495208, + "grad_norm": 0.8241305351257324, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9021 + }, + { + "epoch": 7.206070287539936, + "grad_norm": 0.21823863685131073, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9022 + }, + { + "epoch": 7.206869009584665, + "grad_norm": 0.14799124002456665, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9023 + }, + { + "epoch": 7.207667731629393, + "grad_norm": 0.09815513342618942, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9024 + }, + { + "epoch": 7.208466453674121, + "grad_norm": 0.2076011300086975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9025 + }, + { + "epoch": 7.2092651757188495, + "grad_norm": 0.13652865588665009, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9026 + }, + { + "epoch": 7.210063897763578, + "grad_norm": 0.15180739760398865, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9027 + }, + { + "epoch": 7.210862619808307, + "grad_norm": 0.11385779827833176, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9028 + }, + { + "epoch": 7.211661341853035, + "grad_norm": 0.05047432705760002, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9029 + }, + { + "epoch": 7.212460063897764, + "grad_norm": 0.13789398968219757, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9030 + }, + { + "epoch": 7.213258785942492, + "grad_norm": 0.10509981215000153, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9031 + }, + { + "epoch": 7.214057507987221, + "grad_norm": 0.19650724530220032, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9032 + }, + { + "epoch": 7.214856230031949, + "grad_norm": 0.11788946390151978, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9033 + }, + { + "epoch": 7.215654952076678, + "grad_norm": 0.11023712903261185, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9034 + }, + { + "epoch": 7.216453674121405, + "grad_norm": 0.3382134735584259, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9035 + }, + { + "epoch": 7.217252396166134, + "grad_norm": 0.20465348660945892, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9036 + }, + { + "epoch": 7.218051118210862, + "grad_norm": 0.17456264793872833, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9037 + }, + { + "epoch": 7.218849840255591, + "grad_norm": 0.09034306555986404, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9038 + }, + { + "epoch": 7.2196485623003195, + "grad_norm": 0.15296493470668793, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9039 + }, + { + "epoch": 7.220447284345048, + "grad_norm": 0.1379650980234146, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9040 + }, + { + "epoch": 7.2212460063897765, + "grad_norm": 0.20932430028915405, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9041 + }, + { + "epoch": 7.222044728434505, + "grad_norm": 0.09309016168117523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9042 + }, + { + "epoch": 7.222843450479234, + "grad_norm": 0.13084891438484192, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9043 + }, + { + "epoch": 7.223642172523961, + "grad_norm": 0.1435803472995758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9044 + }, + { + "epoch": 7.22444089456869, + "grad_norm": 0.05868425592780113, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9045 + }, + { + "epoch": 7.225239616613418, + "grad_norm": 0.09483210742473602, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9046 + }, + { + "epoch": 7.226038338658147, + "grad_norm": 0.20051591098308563, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9047 + }, + { + "epoch": 7.226837060702875, + "grad_norm": 0.09253975749015808, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9048 + }, + { + "epoch": 7.227635782747604, + "grad_norm": 0.15865609049797058, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9049 + }, + { + "epoch": 7.228434504792332, + "grad_norm": 0.14421933889389038, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9050 + }, + { + "epoch": 7.229233226837061, + "grad_norm": 0.13492006063461304, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9051 + }, + { + "epoch": 7.2300319488817895, + "grad_norm": 0.06581155210733414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9052 + }, + { + "epoch": 7.230830670926518, + "grad_norm": 0.12610170245170593, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9053 + }, + { + "epoch": 7.231629392971246, + "grad_norm": 0.12813681364059448, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9054 + }, + { + "epoch": 7.232428115015974, + "grad_norm": 0.07228157669305801, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9055 + }, + { + "epoch": 7.233226837060703, + "grad_norm": 0.13456740975379944, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9056 + }, + { + "epoch": 7.234025559105431, + "grad_norm": 0.10491029918193817, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9057 + }, + { + "epoch": 7.23482428115016, + "grad_norm": 0.14090387523174286, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9058 + }, + { + "epoch": 7.235623003194888, + "grad_norm": 0.10722684115171432, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9059 + }, + { + "epoch": 7.236421725239617, + "grad_norm": 0.05123287811875343, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9060 + }, + { + "epoch": 7.237220447284345, + "grad_norm": 0.1203593909740448, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9061 + }, + { + "epoch": 7.238019169329074, + "grad_norm": 0.07847320288419724, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9062 + }, + { + "epoch": 7.2388178913738015, + "grad_norm": 0.09621457010507584, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9063 + }, + { + "epoch": 7.23961661341853, + "grad_norm": 0.11915068328380585, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9064 + }, + { + "epoch": 7.2404153354632586, + "grad_norm": 0.18357326090335846, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9065 + }, + { + "epoch": 7.241214057507987, + "grad_norm": 0.06862817704677582, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9066 + }, + { + "epoch": 7.242012779552716, + "grad_norm": 0.05091634392738342, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9067 + }, + { + "epoch": 7.242811501597444, + "grad_norm": 0.09132825583219528, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9068 + }, + { + "epoch": 7.243610223642173, + "grad_norm": 0.11998780816793442, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9069 + }, + { + "epoch": 7.244408945686901, + "grad_norm": 0.0678768903017044, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9070 + }, + { + "epoch": 7.24520766773163, + "grad_norm": 0.19880260527133942, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9071 + }, + { + "epoch": 7.246006389776358, + "grad_norm": 0.06379543989896774, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9072 + }, + { + "epoch": 7.246805111821086, + "grad_norm": 0.06652764976024628, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9073 + }, + { + "epoch": 7.247603833865814, + "grad_norm": 0.10495885461568832, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9074 + }, + { + "epoch": 7.248402555910543, + "grad_norm": 0.14753985404968262, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9075 + }, + { + "epoch": 7.2492012779552715, + "grad_norm": 0.08283182233572006, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9076 + }, + { + "epoch": 7.25, + "grad_norm": 0.1378672569990158, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9077 + }, + { + "epoch": 7.2507987220447285, + "grad_norm": 0.10274125635623932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9078 + }, + { + "epoch": 7.251597444089457, + "grad_norm": 0.09236814826726913, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9079 + }, + { + "epoch": 7.252396166134186, + "grad_norm": 0.07923156023025513, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9080 + }, + { + "epoch": 7.253194888178914, + "grad_norm": 0.2953792214393616, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9081 + }, + { + "epoch": 7.253993610223642, + "grad_norm": 9.043856620788574, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9082 + }, + { + "epoch": 7.25479233226837, + "grad_norm": 60.094329833984375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9083 + }, + { + "epoch": 7.255591054313099, + "grad_norm": 48.363075256347656, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 9084 + }, + { + "epoch": 7.256389776357827, + "grad_norm": 92.13807678222656, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9085 + }, + { + "epoch": 7.257188498402556, + "grad_norm": 71.66429138183594, + "learning_rate": 0.0005, + "loss": 1.1524, + "step": 9086 + }, + { + "epoch": 7.257987220447284, + "grad_norm": 29.742534637451172, + "learning_rate": 0.0005, + "loss": 1.2362, + "step": 9087 + }, + { + "epoch": 7.258785942492013, + "grad_norm": 1.1841496229171753, + "learning_rate": 0.0005, + "loss": 1.4452, + "step": 9088 + }, + { + "epoch": 7.2595846645367414, + "grad_norm": 0.7909824252128601, + "learning_rate": 0.0005, + "loss": 1.3049, + "step": 9089 + }, + { + "epoch": 7.26038338658147, + "grad_norm": 0.796114444732666, + "learning_rate": 0.0005, + "loss": 1.2852, + "step": 9090 + }, + { + "epoch": 7.261182108626198, + "grad_norm": 0.9014440178871155, + "learning_rate": 0.0005, + "loss": 1.2243, + "step": 9091 + }, + { + "epoch": 7.261980830670926, + "grad_norm": 0.5654944777488708, + "learning_rate": 0.0005, + "loss": 1.1462, + "step": 9092 + }, + { + "epoch": 7.262779552715655, + "grad_norm": 1.0784763097763062, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 9093 + }, + { + "epoch": 7.263578274760383, + "grad_norm": 0.9014595150947571, + "learning_rate": 0.0005, + "loss": 1.1629, + "step": 9094 + }, + { + "epoch": 7.264376996805112, + "grad_norm": 0.4847378730773926, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9095 + }, + { + "epoch": 7.26517571884984, + "grad_norm": 0.5493710041046143, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9096 + }, + { + "epoch": 7.265974440894569, + "grad_norm": 1.0691193342208862, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 9097 + }, + { + "epoch": 7.266773162939297, + "grad_norm": 2.062331199645996, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 9098 + }, + { + "epoch": 7.267571884984026, + "grad_norm": 2.778977632522583, + "learning_rate": 0.0005, + "loss": 1.2775, + "step": 9099 + }, + { + "epoch": 7.268370607028754, + "grad_norm": 0.8807574510574341, + "learning_rate": 0.0005, + "loss": 1.2851, + "step": 9100 + }, + { + "epoch": 7.269169329073483, + "grad_norm": 1.0370792150497437, + "learning_rate": 0.0005, + "loss": 1.1677, + "step": 9101 + }, + { + "epoch": 7.2699680511182105, + "grad_norm": 0.5272591710090637, + "learning_rate": 0.0005, + "loss": 1.1754, + "step": 9102 + }, + { + "epoch": 7.270766773162939, + "grad_norm": 0.5510113835334778, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 9103 + }, + { + "epoch": 7.271565495207668, + "grad_norm": 0.4650730490684509, + "learning_rate": 0.0005, + "loss": 1.1741, + "step": 9104 + }, + { + "epoch": 7.272364217252396, + "grad_norm": 1.071080207824707, + "learning_rate": 0.0005, + "loss": 1.1418, + "step": 9105 + }, + { + "epoch": 7.273162939297125, + "grad_norm": 0.32088524103164673, + "learning_rate": 0.0005, + "loss": 1.1304, + "step": 9106 + }, + { + "epoch": 7.273961661341853, + "grad_norm": 1.2110369205474854, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 9107 + }, + { + "epoch": 7.274760383386582, + "grad_norm": 0.8781233429908752, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 9108 + }, + { + "epoch": 7.27555910543131, + "grad_norm": 0.356841117143631, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9109 + }, + { + "epoch": 7.276357827476039, + "grad_norm": 0.41136255860328674, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 9110 + }, + { + "epoch": 7.277156549520766, + "grad_norm": 0.30638960003852844, + "learning_rate": 0.0005, + "loss": 1.1006, + "step": 9111 + }, + { + "epoch": 7.277955271565495, + "grad_norm": 0.3056134879589081, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 9112 + }, + { + "epoch": 7.2787539936102235, + "grad_norm": 0.3053964376449585, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 9113 + }, + { + "epoch": 7.279552715654952, + "grad_norm": 0.2799919843673706, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 9114 + }, + { + "epoch": 7.2803514376996805, + "grad_norm": 0.19091907143592834, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 9115 + }, + { + "epoch": 7.281150159744409, + "grad_norm": 0.19973579049110413, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 9116 + }, + { + "epoch": 7.281948881789138, + "grad_norm": 0.21867726743221283, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 9117 + }, + { + "epoch": 7.282747603833866, + "grad_norm": 0.10351689904928207, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 9118 + }, + { + "epoch": 7.283546325878595, + "grad_norm": 0.16956113278865814, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 9119 + }, + { + "epoch": 7.284345047923322, + "grad_norm": 0.2959003150463104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 9120 + }, + { + "epoch": 7.285143769968051, + "grad_norm": 0.18194587528705597, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 9121 + }, + { + "epoch": 7.285942492012779, + "grad_norm": 0.10713140666484833, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 9122 + }, + { + "epoch": 7.286741214057508, + "grad_norm": 0.2391309142112732, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9123 + }, + { + "epoch": 7.287539936102236, + "grad_norm": 0.25640085339546204, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 9124 + }, + { + "epoch": 7.288338658146965, + "grad_norm": 0.25697845220565796, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9125 + }, + { + "epoch": 7.289137380191693, + "grad_norm": 0.2679392695426941, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 9126 + }, + { + "epoch": 7.289936102236422, + "grad_norm": 0.3405737280845642, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9127 + }, + { + "epoch": 7.2907348242811505, + "grad_norm": 0.31081417202949524, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 9128 + }, + { + "epoch": 7.291533546325878, + "grad_norm": 0.15159590542316437, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 9129 + }, + { + "epoch": 7.292332268370607, + "grad_norm": 1.1609382629394531, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 9130 + }, + { + "epoch": 7.293130990415335, + "grad_norm": 0.5588571429252625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 9131 + }, + { + "epoch": 7.293929712460064, + "grad_norm": 0.47076234221458435, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9132 + }, + { + "epoch": 7.294728434504792, + "grad_norm": 1.184756875038147, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 9133 + }, + { + "epoch": 7.295527156549521, + "grad_norm": 0.40956422686576843, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9134 + }, + { + "epoch": 7.296325878594249, + "grad_norm": 0.8017024397850037, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 9135 + }, + { + "epoch": 7.297124600638978, + "grad_norm": 0.29993146657943726, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9136 + }, + { + "epoch": 7.297923322683706, + "grad_norm": 0.4549245238304138, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 9137 + }, + { + "epoch": 7.298722044728435, + "grad_norm": 0.26366063952445984, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 9138 + }, + { + "epoch": 7.2995207667731625, + "grad_norm": 0.3126361668109894, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 9139 + }, + { + "epoch": 7.300319488817891, + "grad_norm": 0.18184784054756165, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 9140 + }, + { + "epoch": 7.30111821086262, + "grad_norm": 0.91683429479599, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 9141 + }, + { + "epoch": 7.301916932907348, + "grad_norm": 3.3384642601013184, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9142 + }, + { + "epoch": 7.302715654952077, + "grad_norm": 0.21734145283699036, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 9143 + }, + { + "epoch": 7.303514376996805, + "grad_norm": 0.13850291073322296, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9144 + }, + { + "epoch": 7.304313099041534, + "grad_norm": 0.1737629920244217, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 9145 + }, + { + "epoch": 7.305111821086262, + "grad_norm": 0.3947316110134125, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 9146 + }, + { + "epoch": 7.305910543130991, + "grad_norm": 0.16360799968242645, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9147 + }, + { + "epoch": 7.306709265175719, + "grad_norm": 0.14816711843013763, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9148 + }, + { + "epoch": 7.307507987220447, + "grad_norm": 0.13554179668426514, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9149 + }, + { + "epoch": 7.3083067092651754, + "grad_norm": 0.10308978706598282, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9150 + }, + { + "epoch": 7.309105431309904, + "grad_norm": 0.11216582357883453, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 9151 + }, + { + "epoch": 7.3099041533546325, + "grad_norm": 0.08531700819730759, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9152 + }, + { + "epoch": 7.310702875399361, + "grad_norm": 0.10261841118335724, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9153 + }, + { + "epoch": 7.31150159744409, + "grad_norm": 0.18318074941635132, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9154 + }, + { + "epoch": 7.312300319488818, + "grad_norm": 0.1616939902305603, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9155 + }, + { + "epoch": 7.313099041533547, + "grad_norm": 0.10412739217281342, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9156 + }, + { + "epoch": 7.313897763578275, + "grad_norm": 0.14097075164318085, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9157 + }, + { + "epoch": 7.314696485623003, + "grad_norm": 0.2168329358100891, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 9158 + }, + { + "epoch": 7.315495207667731, + "grad_norm": 0.14337286353111267, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 9159 + }, + { + "epoch": 7.31629392971246, + "grad_norm": 0.10328586399555206, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9160 + }, + { + "epoch": 7.317092651757188, + "grad_norm": 0.15820610523223877, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9161 + }, + { + "epoch": 7.317891373801917, + "grad_norm": 0.11771009862422943, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9162 + }, + { + "epoch": 7.318690095846645, + "grad_norm": 0.06801208108663559, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9163 + }, + { + "epoch": 7.319488817891374, + "grad_norm": 0.08691044896841049, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9164 + }, + { + "epoch": 7.3202875399361025, + "grad_norm": 0.10149878263473511, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9165 + }, + { + "epoch": 7.321086261980831, + "grad_norm": 0.08544973284006119, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9166 + }, + { + "epoch": 7.321884984025559, + "grad_norm": 0.21312831342220306, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9167 + }, + { + "epoch": 7.322683706070287, + "grad_norm": 0.09866507351398468, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9168 + }, + { + "epoch": 7.323482428115016, + "grad_norm": 0.09676753729581833, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9169 + }, + { + "epoch": 7.324281150159744, + "grad_norm": 0.1783452033996582, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9170 + }, + { + "epoch": 7.325079872204473, + "grad_norm": 0.16399280726909637, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9171 + }, + { + "epoch": 7.325878594249201, + "grad_norm": 0.1160425990819931, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9172 + }, + { + "epoch": 7.32667731629393, + "grad_norm": 0.09826952964067459, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9173 + }, + { + "epoch": 7.327476038338658, + "grad_norm": 0.1292516440153122, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9174 + }, + { + "epoch": 7.328274760383387, + "grad_norm": 0.1253383606672287, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9175 + }, + { + "epoch": 7.329073482428115, + "grad_norm": 0.15330855548381805, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9176 + }, + { + "epoch": 7.329872204472843, + "grad_norm": 0.16339725255966187, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9177 + }, + { + "epoch": 7.330670926517572, + "grad_norm": 0.1716328263282776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9178 + }, + { + "epoch": 7.3314696485623, + "grad_norm": 0.07669667154550552, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9179 + }, + { + "epoch": 7.332268370607029, + "grad_norm": 0.06626272946596146, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 9180 + }, + { + "epoch": 7.333067092651757, + "grad_norm": 0.0935940146446228, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9181 + }, + { + "epoch": 7.333865814696486, + "grad_norm": 0.07840511202812195, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9182 + }, + { + "epoch": 7.334664536741214, + "grad_norm": 0.07776588946580887, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9183 + }, + { + "epoch": 7.335463258785943, + "grad_norm": 0.084624283015728, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9184 + }, + { + "epoch": 7.336261980830671, + "grad_norm": 0.07562167197465897, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9185 + }, + { + "epoch": 7.3370607028754, + "grad_norm": 0.08628194034099579, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9186 + }, + { + "epoch": 7.337859424920127, + "grad_norm": 0.0654950812458992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9187 + }, + { + "epoch": 7.338658146964856, + "grad_norm": 0.06403883546590805, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9188 + }, + { + "epoch": 7.3394568690095845, + "grad_norm": 0.8679103851318359, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9189 + }, + { + "epoch": 7.340255591054313, + "grad_norm": 0.42257770895957947, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 9190 + }, + { + "epoch": 7.3410543130990416, + "grad_norm": 0.3017493486404419, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 9191 + }, + { + "epoch": 7.34185303514377, + "grad_norm": 0.30509164929389954, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 9192 + }, + { + "epoch": 7.342651757188499, + "grad_norm": 0.28457221388816833, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 9193 + }, + { + "epoch": 7.343450479233227, + "grad_norm": 0.2734214961528778, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9194 + }, + { + "epoch": 7.344249201277956, + "grad_norm": 0.2931375801563263, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 9195 + }, + { + "epoch": 7.345047923322683, + "grad_norm": 0.11534975469112396, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9196 + }, + { + "epoch": 7.345846645367412, + "grad_norm": 0.1489555388689041, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 9197 + }, + { + "epoch": 7.34664536741214, + "grad_norm": 0.13024470210075378, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 9198 + }, + { + "epoch": 7.347444089456869, + "grad_norm": 0.1413331776857376, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9199 + }, + { + "epoch": 7.348242811501597, + "grad_norm": 0.07862340658903122, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9200 + }, + { + "epoch": 7.349041533546326, + "grad_norm": 0.0870542973279953, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9201 + }, + { + "epoch": 7.3498402555910545, + "grad_norm": 0.07556174695491791, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 9202 + }, + { + "epoch": 7.350638977635783, + "grad_norm": 0.07381146401166916, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9203 + }, + { + "epoch": 7.3514376996805115, + "grad_norm": 0.5006929636001587, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 9204 + }, + { + "epoch": 7.352236421725239, + "grad_norm": 0.2980809807777405, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 9205 + }, + { + "epoch": 7.353035143769968, + "grad_norm": 0.20632435381412506, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 9206 + }, + { + "epoch": 7.353833865814696, + "grad_norm": 0.2028435915708542, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9207 + }, + { + "epoch": 7.354632587859425, + "grad_norm": 0.220264732837677, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9208 + }, + { + "epoch": 7.355431309904153, + "grad_norm": 0.07175029814243317, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9209 + }, + { + "epoch": 7.356230031948882, + "grad_norm": 0.20052626729011536, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9210 + }, + { + "epoch": 7.35702875399361, + "grad_norm": 0.3549690544605255, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9211 + }, + { + "epoch": 7.357827476038339, + "grad_norm": 0.1310572475194931, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9212 + }, + { + "epoch": 7.358626198083067, + "grad_norm": 0.9551740288734436, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 9213 + }, + { + "epoch": 7.359424920127796, + "grad_norm": 0.13663409650325775, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9214 + }, + { + "epoch": 7.360223642172524, + "grad_norm": 0.11436715722084045, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9215 + }, + { + "epoch": 7.361022364217252, + "grad_norm": 0.10911283642053604, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 9216 + }, + { + "epoch": 7.361821086261981, + "grad_norm": 0.11186671257019043, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9217 + }, + { + "epoch": 7.362619808306709, + "grad_norm": 0.1308698207139969, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9218 + }, + { + "epoch": 7.363418530351438, + "grad_norm": 0.07584013044834137, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9219 + }, + { + "epoch": 7.364217252396166, + "grad_norm": 0.07789483666419983, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 9220 + }, + { + "epoch": 7.365015974440895, + "grad_norm": 0.12758736312389374, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 9221 + }, + { + "epoch": 7.365814696485623, + "grad_norm": 0.09310994297266006, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9222 + }, + { + "epoch": 7.366613418530352, + "grad_norm": 0.14761847257614136, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9223 + }, + { + "epoch": 7.36741214057508, + "grad_norm": 0.8784921169281006, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9224 + }, + { + "epoch": 7.368210862619808, + "grad_norm": 0.07754036784172058, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9225 + }, + { + "epoch": 7.3690095846645365, + "grad_norm": 0.06706640869379044, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9226 + }, + { + "epoch": 7.369808306709265, + "grad_norm": 0.0949360579252243, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9227 + }, + { + "epoch": 7.3706070287539935, + "grad_norm": 0.09635552763938904, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9228 + }, + { + "epoch": 7.371405750798722, + "grad_norm": 0.15888135135173798, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9229 + }, + { + "epoch": 7.372204472843451, + "grad_norm": 0.1487814337015152, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9230 + }, + { + "epoch": 7.373003194888179, + "grad_norm": 0.09755469113588333, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9231 + }, + { + "epoch": 7.373801916932908, + "grad_norm": 0.2550356984138489, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9232 + }, + { + "epoch": 7.374600638977636, + "grad_norm": 0.13796621561050415, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9233 + }, + { + "epoch": 7.375399361022364, + "grad_norm": 0.06727192550897598, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9234 + }, + { + "epoch": 7.376198083067092, + "grad_norm": 0.09111928194761276, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 9235 + }, + { + "epoch": 7.376996805111821, + "grad_norm": 0.15708492696285248, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9236 + }, + { + "epoch": 7.377795527156549, + "grad_norm": 0.06607159227132797, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9237 + }, + { + "epoch": 7.378594249201278, + "grad_norm": 0.3495469391345978, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9238 + }, + { + "epoch": 7.3793929712460065, + "grad_norm": 0.249598890542984, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9239 + }, + { + "epoch": 7.380191693290735, + "grad_norm": 0.1506706029176712, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9240 + }, + { + "epoch": 7.3809904153354635, + "grad_norm": 0.2053573578596115, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9241 + }, + { + "epoch": 7.381789137380192, + "grad_norm": 0.20234468579292297, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9242 + }, + { + "epoch": 7.38258785942492, + "grad_norm": 0.23514828085899353, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9243 + }, + { + "epoch": 7.383386581469648, + "grad_norm": 0.13418453931808472, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9244 + }, + { + "epoch": 7.384185303514377, + "grad_norm": 0.07703951746225357, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9245 + }, + { + "epoch": 7.384984025559105, + "grad_norm": 0.20256030559539795, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 9246 + }, + { + "epoch": 7.385782747603834, + "grad_norm": 0.1140165850520134, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9247 + }, + { + "epoch": 7.386581469648562, + "grad_norm": 0.6283542513847351, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9248 + }, + { + "epoch": 7.387380191693291, + "grad_norm": 0.11779789626598358, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9249 + }, + { + "epoch": 7.388178913738019, + "grad_norm": 0.09821031987667084, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9250 + } + ], + "logging_steps": 1.0, + "max_steps": 751200, + "num_input_tokens_seen": 0, + "num_train_epochs": 600, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.282864558045135e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9250/training_args.bin b/checkpoint-9250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0162074424e3714af8119d3be2b6e69cbb5b9f2 --- /dev/null +++ b/checkpoint-9250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06816c37733f99d23f044cefd981b2f404a72ddf40fa59f794154596b842fa95 +size 6072 diff --git a/checkpoint-9500/config.json b/checkpoint-9500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0ae1ba49f17c446b66e627e5e96aa2c97bb02d --- /dev/null +++ b/checkpoint-9500/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "checkpoints/vlm_dc-vae-f32c32-sana-1.1_layerwise-0_group-7/checkpoint-9250", + "ar_steps": 1, + "architectures": [ + "DiffVLMDiffusion" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "condition_layer": -1, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "image_token_id": 151655, + "img_cross_attention_dim": 2048, + "img_diffuser_depth": 6, + "img_ffn_dim_multiplier": null, + "img_hidden_size": 1536, + "img_multiple_of": 256, + "img_norm_eps": 1e-05, + "img_num_attention_heads": 12, + "img_num_kv_heads": 12, + "img_qk_norm": true, + "in_channels": 32, + "initializer_range": 0.02, + "inject_img_diffuser": false, + "input_size": 32, + "intermediate_size": 8960, + "layer_group_size": 7, + "layerwise_start_idx": 0, + "lora_alpha": 256, + "lora_bias": "none", + "lora_dropout": 0.05, + "lora_enable": false, + "lora_r": 128, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_vl", + "non_linearity": 1, + "norm_elementwise_affine": true, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "patch_size": 2, + "repa_coeff": 0.1, + "repa_layers": "2", + "repa_shared": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sample_size": 128, + "sampling_steps": 28, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.47.0", + "use_cache": true, + "use_repa": false, + "use_residual_attn": false, + "use_sliding_window": false, + "vae_path": "mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers", + "video_token_id": 151656, + "vision_config": { + "hidden_size": 1536, + "in_chans": 3, + "model_type": "qwen2_vl", + "spatial_patch_size": 14 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/checkpoint-9500/generation_config.json b/checkpoint-9500/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3d465da90cb0ab59d8ba7babdefb0e88fbfa6b --- /dev/null +++ b/checkpoint-9500/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": 151645, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-9500/model-00001-of-00002.safetensors b/checkpoint-9500/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4981ae317592aff7b4a6d8b438838e3d62fd7558 --- /dev/null +++ b/checkpoint-9500/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027d78d8ebd5a248c07aa08795be021166ca35cae006930ce7a90250222ae329 +size 4998598816 diff --git a/checkpoint-9500/model-00002-of-00002.safetensors b/checkpoint-9500/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..042c86aaf1042341f3332644e13349511c4034f6 --- /dev/null +++ b/checkpoint-9500/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3d6cc77aa0d87ef09990eae4a583cf5992972c92ba1585efe3005a8f84fb25 +size 4990560652 diff --git a/checkpoint-9500/model.safetensors.index.json b/checkpoint-9500/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b85b852967adb370204fb2c3e3d18822b10ab5 --- /dev/null +++ b/checkpoint-9500/model.safetensors.index.json @@ -0,0 +1,1740 @@ +{ + "metadata": { + "total_size": 9988962252 + }, + "weight_map": { + "embed_tokens.weight": "model-00002-of-00002.safetensors", + "img2text.bias": "model-00001-of-00002.safetensors", + "img2text.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.weight": "model-00001-of-00002.safetensors", + "layers.0.gate": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.1.gate": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.10.gate": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.11.gate": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.12.gate": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.13.gate": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.14.gate": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.15.gate": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.16.gate": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.17.gate": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.17.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.17.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.18.gate": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.19.gate": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.2.gate": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.20.gate": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.21.gate": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.22.gate": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.23.gate": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.24.gate": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.25.gate": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.26.gate": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.27.gate": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.3.gate": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.4.gate": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.5.gate": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.6.gate": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.7.gate": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.8.gate": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.9.gate": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "norm.weight": "model-00002-of-00002.safetensors", + "patch_embedder.proj.bias": "model-00001-of-00002.safetensors", + "patch_embedder.proj.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.weight": "model-00001-of-00002.safetensors", + "vae.decoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-9500/optimizer.pt b/checkpoint-9500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f574a79c9857572ed74b7d6cd9754a5f29445977 --- /dev/null +++ b/checkpoint-9500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcfe91e3666c374656bf917b8ad4b7ac0da6db13c73d8227f967cead0379e24 +size 15084326534 diff --git a/checkpoint-9500/rng_state_0.pth b/checkpoint-9500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b270c63950cea74fa36a605a753de4c781dbc49 --- /dev/null +++ b/checkpoint-9500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4beb399a9aad04e710e18e8622267c919c8a5c94153adef8d6d164aad7f35fe4 +size 15984 diff --git a/checkpoint-9500/rng_state_1.pth b/checkpoint-9500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..44faf6900580d9840f22f434490d37a95bef3e2f --- /dev/null +++ b/checkpoint-9500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:324837057ced6db286a3516b2a9602e89768f490e46b80d1c76a33b909dcb7da +size 15984 diff --git a/checkpoint-9500/rng_state_2.pth b/checkpoint-9500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..720b998d03a0f75b27f9cc945429e5fe0082023f --- /dev/null +++ b/checkpoint-9500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fc68175f400c4dea4fa448867394a3058a01ffa73f7484cf5878e8566d8ffbc +size 15984 diff --git a/checkpoint-9500/rng_state_3.pth b/checkpoint-9500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..63f3fcf7a8e2bf561c5977631637d1d98fb9261c --- /dev/null +++ b/checkpoint-9500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89c00677737944df40c8ff65a78c8683ee5840398d33b26331b8ffae34329a66 +size 15984 diff --git a/checkpoint-9500/rng_state_4.pth b/checkpoint-9500/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8fa24f2e40ed9e76623e434aa27bfd7cc3986124 --- /dev/null +++ b/checkpoint-9500/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4334aeb339331630c029df65f02b5f93c703add9ea0d60e353d6019e12f0e82 +size 15984 diff --git a/checkpoint-9500/rng_state_5.pth b/checkpoint-9500/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..afc5841119527240fdfb5f8a19378e075949c28f --- /dev/null +++ b/checkpoint-9500/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9485699ba1f8731cde5cfd77a6ce95b8f8317a85bb09a3ba2489b4067501492d +size 15984 diff --git a/checkpoint-9500/rng_state_6.pth b/checkpoint-9500/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..e16587aff85f32da28c27df57e5d88a833437559 --- /dev/null +++ b/checkpoint-9500/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe104ee345b7463caaac129168eb06b2b53a8623037bf23c4cd2aa8de52ca6f0 +size 15984 diff --git a/checkpoint-9500/rng_state_7.pth b/checkpoint-9500/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..819388e8fc56c0ece794441f03abcf63261ca9a4 --- /dev/null +++ b/checkpoint-9500/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30ec7c0eb2cf7359b7a47c2eb263f7cc67dc1fc25de0de1b4a6e5b7b679cac4c +size 15984 diff --git a/checkpoint-9500/scheduler.pt b/checkpoint-9500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a327077f2a1a1fe52c3963dc81aa02eecbcdae07 --- /dev/null +++ b/checkpoint-9500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24954beb59590793d5fc0ddbe7d1a6127ce1ae2ee15502114a025e447199d147 +size 1064 diff --git a/checkpoint-9500/trainer_state.json b/checkpoint-9500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..77792638bb1f4ada48578894896a53d7732c6ac8 --- /dev/null +++ b/checkpoint-9500/trainer_state.json @@ -0,0 +1,66533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.587859424920127, + "eval_steps": 500, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007987220447284345, + "grad_norm": 0.08758673816919327, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1 + }, + { + "epoch": 0.001597444089456869, + "grad_norm": 2.9034857749938965, + "learning_rate": 0.0005, + "loss": 1.5342, + "step": 2 + }, + { + "epoch": 0.0023961661341853034, + "grad_norm": 1.260856032371521, + "learning_rate": 0.0005, + "loss": 1.3074, + "step": 3 + }, + { + "epoch": 0.003194888178913738, + "grad_norm": 2.2480077743530273, + "learning_rate": 0.0005, + "loss": 1.3434, + "step": 4 + }, + { + "epoch": 0.003993610223642172, + "grad_norm": 0.6822420358657837, + "learning_rate": 0.0005, + "loss": 1.2075, + "step": 5 + }, + { + "epoch": 0.004792332268370607, + "grad_norm": 0.7826036214828491, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 6 + }, + { + "epoch": 0.005591054313099041, + "grad_norm": 0.690284788608551, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 7 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 0.49136775732040405, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 8 + }, + { + "epoch": 0.00718849840255591, + "grad_norm": 0.3124309182167053, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 9 + }, + { + "epoch": 0.007987220447284345, + "grad_norm": 0.3409576714038849, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 10 + }, + { + "epoch": 0.00878594249201278, + "grad_norm": 0.25508174300193787, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 11 + }, + { + "epoch": 0.009584664536741214, + "grad_norm": 0.19042040407657623, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 12 + }, + { + "epoch": 0.010383386581469648, + "grad_norm": 0.2090323120355606, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 13 + }, + { + "epoch": 0.011182108626198083, + "grad_norm": 0.2102068066596985, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 14 + }, + { + "epoch": 0.011980830670926517, + "grad_norm": 0.12789177894592285, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 15 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 0.10204717516899109, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 16 + }, + { + "epoch": 0.013578274760383386, + "grad_norm": 0.174830362200737, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 17 + }, + { + "epoch": 0.01437699680511182, + "grad_norm": 0.25637468695640564, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 18 + }, + { + "epoch": 0.015175718849840255, + "grad_norm": 0.28002411127090454, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 19 + }, + { + "epoch": 0.01597444089456869, + "grad_norm": 0.23047354817390442, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 20 + }, + { + "epoch": 0.016773162939297124, + "grad_norm": 0.1548614650964737, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 21 + }, + { + "epoch": 0.01757188498402556, + "grad_norm": 0.07078541815280914, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 22 + }, + { + "epoch": 0.018370607028753993, + "grad_norm": 0.10615550726652145, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 23 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 0.10240291804075241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 24 + }, + { + "epoch": 0.019968051118210862, + "grad_norm": 0.07588993012905121, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 25 + }, + { + "epoch": 0.020766773162939296, + "grad_norm": 0.06380276381969452, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 26 + }, + { + "epoch": 0.02156549520766773, + "grad_norm": 0.06891524791717529, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 27 + }, + { + "epoch": 0.022364217252396165, + "grad_norm": 0.0625377744436264, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 28 + }, + { + "epoch": 0.0231629392971246, + "grad_norm": 0.12064792215824127, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 29 + }, + { + "epoch": 0.023961661341853034, + "grad_norm": 0.29220151901245117, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 30 + }, + { + "epoch": 0.02476038338658147, + "grad_norm": 0.7822219729423523, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 31 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 1.5172864198684692, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 32 + }, + { + "epoch": 0.026357827476038338, + "grad_norm": 0.18434809148311615, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 33 + }, + { + "epoch": 0.027156549520766772, + "grad_norm": 0.535632848739624, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 34 + }, + { + "epoch": 0.027955271565495207, + "grad_norm": 0.21549028158187866, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 35 + }, + { + "epoch": 0.02875399361022364, + "grad_norm": 0.4726889431476593, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 36 + }, + { + "epoch": 0.029552715654952075, + "grad_norm": 0.2519988417625427, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 37 + }, + { + "epoch": 0.03035143769968051, + "grad_norm": 0.2973701059818268, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 38 + }, + { + "epoch": 0.031150159744408944, + "grad_norm": 0.30153587460517883, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 39 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 0.08746712654829025, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 40 + }, + { + "epoch": 0.03274760383386582, + "grad_norm": 0.3308769762516022, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 41 + }, + { + "epoch": 0.03354632587859425, + "grad_norm": 0.10948555171489716, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 42 + }, + { + "epoch": 0.034345047923322686, + "grad_norm": 0.3044797480106354, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 43 + }, + { + "epoch": 0.03514376996805112, + "grad_norm": 0.11677752435207367, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 44 + }, + { + "epoch": 0.035942492012779555, + "grad_norm": 0.30327609181404114, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 45 + }, + { + "epoch": 0.036741214057507986, + "grad_norm": 0.10603009909391403, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 46 + }, + { + "epoch": 0.037539936102236424, + "grad_norm": 0.2693077623844147, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 47 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.11918680369853973, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 48 + }, + { + "epoch": 0.03913738019169329, + "grad_norm": 0.2965734899044037, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 49 + }, + { + "epoch": 0.039936102236421724, + "grad_norm": 0.10428953915834427, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 50 + }, + { + "epoch": 0.04073482428115016, + "grad_norm": 0.23307208716869354, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 51 + }, + { + "epoch": 0.04153354632587859, + "grad_norm": 0.07401563227176666, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 52 + }, + { + "epoch": 0.04233226837060703, + "grad_norm": 0.22344312071800232, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 53 + }, + { + "epoch": 0.04313099041533546, + "grad_norm": 0.1782081127166748, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 54 + }, + { + "epoch": 0.0439297124600639, + "grad_norm": 0.10123606026172638, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 55 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 0.2618716359138489, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 56 + }, + { + "epoch": 0.04552715654952077, + "grad_norm": 0.15046533942222595, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 57 + }, + { + "epoch": 0.0463258785942492, + "grad_norm": 0.1341097205877304, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 58 + }, + { + "epoch": 0.04712460063897764, + "grad_norm": 0.20391245186328888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 59 + }, + { + "epoch": 0.04792332268370607, + "grad_norm": 0.09610722959041595, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 60 + }, + { + "epoch": 0.048722044728434506, + "grad_norm": 0.09877557307481766, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 61 + }, + { + "epoch": 0.04952076677316294, + "grad_norm": 0.16971156001091003, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 62 + }, + { + "epoch": 0.050319488817891375, + "grad_norm": 0.1819174885749817, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 63 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.13067278265953064, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 64 + }, + { + "epoch": 0.051916932907348244, + "grad_norm": 0.10557633638381958, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 65 + }, + { + "epoch": 0.052715654952076675, + "grad_norm": 0.08713806420564651, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 66 + }, + { + "epoch": 0.05351437699680511, + "grad_norm": 0.12453104555606842, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 67 + }, + { + "epoch": 0.054313099041533544, + "grad_norm": 0.19147996604442596, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 68 + }, + { + "epoch": 0.05511182108626198, + "grad_norm": 0.21808673441410065, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 69 + }, + { + "epoch": 0.05591054313099041, + "grad_norm": 0.15922780334949493, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 70 + }, + { + "epoch": 0.05670926517571885, + "grad_norm": 0.09400095790624619, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 71 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 0.071605384349823, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 72 + }, + { + "epoch": 0.05830670926517572, + "grad_norm": 0.08754080533981323, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 73 + }, + { + "epoch": 0.05910543130990415, + "grad_norm": 0.07777409255504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 74 + }, + { + "epoch": 0.05990415335463259, + "grad_norm": 0.04577887803316116, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 75 + }, + { + "epoch": 0.06070287539936102, + "grad_norm": 0.07278449088335037, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 76 + }, + { + "epoch": 0.06150159744408946, + "grad_norm": 0.06739042699337006, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 77 + }, + { + "epoch": 0.06230031948881789, + "grad_norm": 0.06367938220500946, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 78 + }, + { + "epoch": 0.06309904153354633, + "grad_norm": 0.0551401786506176, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 79 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.04846199229359627, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 80 + }, + { + "epoch": 0.06469648562300319, + "grad_norm": 0.089615598320961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 81 + }, + { + "epoch": 0.06549520766773163, + "grad_norm": 0.19073566794395447, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 82 + }, + { + "epoch": 0.06629392971246006, + "grad_norm": 0.26971691846847534, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 83 + }, + { + "epoch": 0.0670926517571885, + "grad_norm": 0.3124604821205139, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 84 + }, + { + "epoch": 0.06789137380191693, + "grad_norm": 0.3448403775691986, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 85 + }, + { + "epoch": 0.06869009584664537, + "grad_norm": 0.2708166837692261, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 86 + }, + { + "epoch": 0.0694888178913738, + "grad_norm": 0.10507494956254959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 87 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 0.1015392392873764, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 88 + }, + { + "epoch": 0.07108626198083066, + "grad_norm": 0.34002622961997986, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 89 + }, + { + "epoch": 0.07188498402555911, + "grad_norm": 0.5238372683525085, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 90 + }, + { + "epoch": 0.07268370607028754, + "grad_norm": 0.5267866253852844, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 91 + }, + { + "epoch": 0.07348242811501597, + "grad_norm": 0.3286864757537842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 92 + }, + { + "epoch": 0.0742811501597444, + "grad_norm": 0.14270304143428802, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 93 + }, + { + "epoch": 0.07507987220447285, + "grad_norm": 0.3481365740299225, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 94 + }, + { + "epoch": 0.07587859424920128, + "grad_norm": 0.33883902430534363, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 95 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.2553725838661194, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 96 + }, + { + "epoch": 0.07747603833865814, + "grad_norm": 0.21944141387939453, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 97 + }, + { + "epoch": 0.07827476038338659, + "grad_norm": 0.18821558356285095, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 98 + }, + { + "epoch": 0.07907348242811502, + "grad_norm": 0.20073482394218445, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 99 + }, + { + "epoch": 0.07987220447284345, + "grad_norm": 0.2643139958381653, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 100 + }, + { + "epoch": 0.08067092651757188, + "grad_norm": 0.1843930184841156, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 101 + }, + { + "epoch": 0.08146964856230032, + "grad_norm": 0.12745684385299683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 102 + }, + { + "epoch": 0.08226837060702875, + "grad_norm": 0.3252592384815216, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 103 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 0.33775797486305237, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 104 + }, + { + "epoch": 0.08386581469648563, + "grad_norm": 0.24846483767032623, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 105 + }, + { + "epoch": 0.08466453674121406, + "grad_norm": 0.1598653495311737, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 106 + }, + { + "epoch": 0.08546325878594249, + "grad_norm": 0.2555698752403259, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 107 + }, + { + "epoch": 0.08626198083067092, + "grad_norm": 0.3770487308502197, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 108 + }, + { + "epoch": 0.08706070287539937, + "grad_norm": 0.3179391026496887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 109 + }, + { + "epoch": 0.0878594249201278, + "grad_norm": 0.11638858914375305, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 110 + }, + { + "epoch": 0.08865814696485623, + "grad_norm": 0.20365215837955475, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 111 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.22354111075401306, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 112 + }, + { + "epoch": 0.0902555910543131, + "grad_norm": 0.1944236010313034, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 113 + }, + { + "epoch": 0.09105431309904154, + "grad_norm": 0.16177603602409363, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 114 + }, + { + "epoch": 0.09185303514376997, + "grad_norm": 0.06650812178850174, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 115 + }, + { + "epoch": 0.0926517571884984, + "grad_norm": 0.20236945152282715, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 116 + }, + { + "epoch": 0.09345047923322684, + "grad_norm": 0.19086670875549316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 117 + }, + { + "epoch": 0.09424920127795527, + "grad_norm": 0.17380473017692566, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 118 + }, + { + "epoch": 0.0950479233226837, + "grad_norm": 0.11360115557909012, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 119 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 0.09359298646450043, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 120 + }, + { + "epoch": 0.09664536741214058, + "grad_norm": 0.15317411720752716, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 121 + }, + { + "epoch": 0.09744408945686901, + "grad_norm": 0.05564137175679207, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 122 + }, + { + "epoch": 0.09824281150159744, + "grad_norm": 0.13476046919822693, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 123 + }, + { + "epoch": 0.09904153354632587, + "grad_norm": 0.11372318118810654, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 124 + }, + { + "epoch": 0.09984025559105432, + "grad_norm": 0.11330179125070572, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 125 + }, + { + "epoch": 0.10063897763578275, + "grad_norm": 0.11304716765880585, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 126 + }, + { + "epoch": 0.10143769968051118, + "grad_norm": 0.06369871646165848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 127 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.14034464955329895, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 128 + }, + { + "epoch": 0.10303514376996806, + "grad_norm": 0.1080808937549591, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 129 + }, + { + "epoch": 0.10383386581469649, + "grad_norm": 0.09568007290363312, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 130 + }, + { + "epoch": 0.10463258785942492, + "grad_norm": 0.1359473019838333, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 131 + }, + { + "epoch": 0.10543130990415335, + "grad_norm": 0.06500346213579178, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 132 + }, + { + "epoch": 0.1062300319488818, + "grad_norm": 0.11564832180738449, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 133 + }, + { + "epoch": 0.10702875399361023, + "grad_norm": 0.2115149199962616, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 134 + }, + { + "epoch": 0.10782747603833866, + "grad_norm": 0.3098243772983551, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 135 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 0.446521133184433, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 136 + }, + { + "epoch": 0.10942492012779553, + "grad_norm": 0.5194831490516663, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 137 + }, + { + "epoch": 0.11022364217252396, + "grad_norm": 0.447731077671051, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 138 + }, + { + "epoch": 0.1110223642172524, + "grad_norm": 0.2195945680141449, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 139 + }, + { + "epoch": 0.11182108626198083, + "grad_norm": 0.1277567446231842, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 140 + }, + { + "epoch": 0.11261980830670927, + "grad_norm": 0.3284558355808258, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 141 + }, + { + "epoch": 0.1134185303514377, + "grad_norm": 0.40208715200424194, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 142 + }, + { + "epoch": 0.11421725239616613, + "grad_norm": 0.28310486674308777, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 143 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.0786294937133789, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 144 + }, + { + "epoch": 0.11581469648562301, + "grad_norm": 0.18283484876155853, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 145 + }, + { + "epoch": 0.11661341853035144, + "grad_norm": 0.20186439156532288, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 146 + }, + { + "epoch": 0.11741214057507987, + "grad_norm": 0.15860706567764282, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 147 + }, + { + "epoch": 0.1182108626198083, + "grad_norm": 0.1436982899904251, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 148 + }, + { + "epoch": 0.11900958466453675, + "grad_norm": 0.15206722915172577, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 149 + }, + { + "epoch": 0.11980830670926518, + "grad_norm": 0.252279132604599, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 150 + }, + { + "epoch": 0.12060702875399361, + "grad_norm": 0.19411228597164154, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 151 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 0.07377714663743973, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 152 + }, + { + "epoch": 0.12220447284345048, + "grad_norm": 0.15493856370449066, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 153 + }, + { + "epoch": 0.12300319488817892, + "grad_norm": 0.275601863861084, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 154 + }, + { + "epoch": 0.12380191693290735, + "grad_norm": 0.42461103200912476, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 155 + }, + { + "epoch": 0.12460063897763578, + "grad_norm": 0.41153159737586975, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 156 + }, + { + "epoch": 0.1253993610223642, + "grad_norm": 0.2487967610359192, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 157 + }, + { + "epoch": 0.12619808306709265, + "grad_norm": 0.10687623918056488, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 158 + }, + { + "epoch": 0.1269968051118211, + "grad_norm": 0.28695282340049744, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 159 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.38554099202156067, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 160 + }, + { + "epoch": 0.12859424920127796, + "grad_norm": 0.25622498989105225, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 161 + }, + { + "epoch": 0.12939297124600638, + "grad_norm": 0.10341542959213257, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 162 + }, + { + "epoch": 0.13019169329073482, + "grad_norm": 0.20450755953788757, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 163 + }, + { + "epoch": 0.13099041533546327, + "grad_norm": 0.2664271295070648, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 164 + }, + { + "epoch": 0.13178913738019168, + "grad_norm": 0.23936089873313904, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 165 + }, + { + "epoch": 0.13258785942492013, + "grad_norm": 0.0662769302725792, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 166 + }, + { + "epoch": 0.13338658146964857, + "grad_norm": 0.13597780466079712, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 167 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 0.15996500849723816, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 168 + }, + { + "epoch": 0.13498402555910544, + "grad_norm": 0.10095447301864624, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 169 + }, + { + "epoch": 0.13578274760383385, + "grad_norm": 0.09733449667692184, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 170 + }, + { + "epoch": 0.1365814696485623, + "grad_norm": 0.16480964422225952, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 171 + }, + { + "epoch": 0.13738019169329074, + "grad_norm": 0.21611596643924713, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 172 + }, + { + "epoch": 0.13817891373801916, + "grad_norm": 0.21607941389083862, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 173 + }, + { + "epoch": 0.1389776357827476, + "grad_norm": 0.2234959453344345, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 174 + }, + { + "epoch": 0.13977635782747605, + "grad_norm": 0.10778137296438217, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 175 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.1758418083190918, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 176 + }, + { + "epoch": 0.1413738019169329, + "grad_norm": 0.30717936158180237, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 177 + }, + { + "epoch": 0.14217252396166133, + "grad_norm": 0.3382156789302826, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 178 + }, + { + "epoch": 0.14297124600638977, + "grad_norm": 0.23189185559749603, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 179 + }, + { + "epoch": 0.14376996805111822, + "grad_norm": 0.04988733306527138, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 180 + }, + { + "epoch": 0.14456869009584664, + "grad_norm": 0.15606579184532166, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 181 + }, + { + "epoch": 0.14536741214057508, + "grad_norm": 0.2366417795419693, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 182 + }, + { + "epoch": 0.14616613418530353, + "grad_norm": 0.21878089010715485, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 183 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 0.09316077083349228, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 184 + }, + { + "epoch": 0.1477635782747604, + "grad_norm": 0.119263656437397, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 185 + }, + { + "epoch": 0.1485623003194888, + "grad_norm": 0.26743847131729126, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 186 + }, + { + "epoch": 0.14936102236421725, + "grad_norm": 0.34438276290893555, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 187 + }, + { + "epoch": 0.1501597444089457, + "grad_norm": 0.30809128284454346, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 188 + }, + { + "epoch": 0.1509584664536741, + "grad_norm": 0.1406010240316391, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 189 + }, + { + "epoch": 0.15175718849840256, + "grad_norm": 0.09509757161140442, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 190 + }, + { + "epoch": 0.152555910543131, + "grad_norm": 0.24529854953289032, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 191 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.2803219258785248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 192 + }, + { + "epoch": 0.15415335463258786, + "grad_norm": 0.18221652507781982, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 193 + }, + { + "epoch": 0.15495207667731628, + "grad_norm": 0.04752795770764351, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 194 + }, + { + "epoch": 0.15575079872204473, + "grad_norm": 0.14151020348072052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 195 + }, + { + "epoch": 0.15654952076677317, + "grad_norm": 0.27345412969589233, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 196 + }, + { + "epoch": 0.1573482428115016, + "grad_norm": 0.36259710788726807, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 197 + }, + { + "epoch": 0.15814696485623003, + "grad_norm": 0.30899694561958313, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 198 + }, + { + "epoch": 0.15894568690095848, + "grad_norm": 0.148394376039505, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 199 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 0.09150427579879761, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 200 + }, + { + "epoch": 0.16054313099041534, + "grad_norm": 0.2579229176044464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 201 + }, + { + "epoch": 0.16134185303514376, + "grad_norm": 0.35417553782463074, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 202 + }, + { + "epoch": 0.1621405750798722, + "grad_norm": 0.3410634994506836, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 203 + }, + { + "epoch": 0.16293929712460065, + "grad_norm": 0.20597697794437408, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 204 + }, + { + "epoch": 0.16373801916932906, + "grad_norm": 0.09722702950239182, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 205 + }, + { + "epoch": 0.1645367412140575, + "grad_norm": 0.29214075207710266, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 206 + }, + { + "epoch": 0.16533546325878595, + "grad_norm": 0.35695526003837585, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 207 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.23948919773101807, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 208 + }, + { + "epoch": 0.16693290734824281, + "grad_norm": 0.06467479467391968, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 209 + }, + { + "epoch": 0.16773162939297126, + "grad_norm": 0.2935601472854614, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 210 + }, + { + "epoch": 0.16853035143769968, + "grad_norm": 0.3354688882827759, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 211 + }, + { + "epoch": 0.16932907348242812, + "grad_norm": 0.206736221909523, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 212 + }, + { + "epoch": 0.17012779552715654, + "grad_norm": 0.04770192503929138, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 213 + }, + { + "epoch": 0.17092651757188498, + "grad_norm": 0.11713571101427078, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 214 + }, + { + "epoch": 0.17172523961661343, + "grad_norm": 0.1751943975687027, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 215 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 0.11709283292293549, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 216 + }, + { + "epoch": 0.1733226837060703, + "grad_norm": 0.08393140882253647, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 217 + }, + { + "epoch": 0.17412140575079874, + "grad_norm": 0.14036497473716736, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 218 + }, + { + "epoch": 0.17492012779552715, + "grad_norm": 0.19809649884700775, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 219 + }, + { + "epoch": 0.1757188498402556, + "grad_norm": 0.16380994021892548, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 220 + }, + { + "epoch": 0.17651757188498401, + "grad_norm": 0.03721015155315399, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 221 + }, + { + "epoch": 0.17731629392971246, + "grad_norm": 0.16769659519195557, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 222 + }, + { + "epoch": 0.1781150159744409, + "grad_norm": 0.2506882846355438, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 223 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 0.2812851667404175, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 224 + }, + { + "epoch": 0.17971246006389777, + "grad_norm": 0.2518095374107361, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 225 + }, + { + "epoch": 0.1805111821086262, + "grad_norm": 0.13027259707450867, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 226 + }, + { + "epoch": 0.18130990415335463, + "grad_norm": 0.051758985966444016, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 227 + }, + { + "epoch": 0.18210862619808307, + "grad_norm": 0.123250812292099, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 228 + }, + { + "epoch": 0.1829073482428115, + "grad_norm": 0.16475827991962433, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 229 + }, + { + "epoch": 0.18370607028753994, + "grad_norm": 0.15224772691726685, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 230 + }, + { + "epoch": 0.18450479233226838, + "grad_norm": 0.10693283379077911, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 231 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 0.059128716588020325, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 232 + }, + { + "epoch": 0.18610223642172524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 233 + }, + { + "epoch": 0.1869009584664537, + "grad_norm": 0.21447211503982544, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 234 + }, + { + "epoch": 0.1876996805111821, + "grad_norm": 0.214809849858284, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 235 + }, + { + "epoch": 0.18849840255591055, + "grad_norm": 0.16398873925209045, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 236 + }, + { + "epoch": 0.18929712460063897, + "grad_norm": 0.08273304253816605, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 237 + }, + { + "epoch": 0.1900958466453674, + "grad_norm": 0.08456159383058548, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 238 + }, + { + "epoch": 0.19089456869009586, + "grad_norm": 0.09653522819280624, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 239 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.13169406354427338, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 240 + }, + { + "epoch": 0.19249201277955272, + "grad_norm": 0.2328217476606369, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 241 + }, + { + "epoch": 0.19329073482428116, + "grad_norm": 0.2226463258266449, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 242 + }, + { + "epoch": 0.19408945686900958, + "grad_norm": 0.13330090045928955, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 243 + }, + { + "epoch": 0.19488817891373802, + "grad_norm": 0.15685412287712097, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 244 + }, + { + "epoch": 0.19568690095846644, + "grad_norm": 0.1528809666633606, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 245 + }, + { + "epoch": 0.1964856230031949, + "grad_norm": 0.2380320429801941, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 246 + }, + { + "epoch": 0.19728434504792333, + "grad_norm": 0.20447947084903717, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 247 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 0.162733793258667, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 248 + }, + { + "epoch": 0.1988817891373802, + "grad_norm": 0.10536827147006989, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 249 + }, + { + "epoch": 0.19968051118210864, + "grad_norm": 0.05464514344930649, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 250 + }, + { + "epoch": 0.20047923322683706, + "grad_norm": 0.052793700248003006, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 251 + }, + { + "epoch": 0.2012779552715655, + "grad_norm": 0.06936854124069214, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 252 + }, + { + "epoch": 0.20207667731629392, + "grad_norm": 0.17630355060100555, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 253 + }, + { + "epoch": 0.20287539936102236, + "grad_norm": 0.23443830013275146, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 254 + }, + { + "epoch": 0.2036741214057508, + "grad_norm": 0.21788854897022247, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 255 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.16827379167079926, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 256 + }, + { + "epoch": 0.20527156549520767, + "grad_norm": 0.08467451483011246, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 257 + }, + { + "epoch": 0.20607028753993611, + "grad_norm": 0.17747341096401215, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 258 + }, + { + "epoch": 0.20686900958466453, + "grad_norm": 0.20212751626968384, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 259 + }, + { + "epoch": 0.20766773162939298, + "grad_norm": 0.13319599628448486, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 260 + }, + { + "epoch": 0.2084664536741214, + "grad_norm": 0.13839752972126007, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 261 + }, + { + "epoch": 0.20926517571884984, + "grad_norm": 0.12351422011852264, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 262 + }, + { + "epoch": 0.21006389776357828, + "grad_norm": 0.1166408434510231, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 263 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 0.15500681102275848, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 264 + }, + { + "epoch": 0.21166134185303515, + "grad_norm": 0.045156076550483704, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 265 + }, + { + "epoch": 0.2124600638977636, + "grad_norm": 0.1413601189851761, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 266 + }, + { + "epoch": 0.213258785942492, + "grad_norm": 0.19309845566749573, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 267 + }, + { + "epoch": 0.21405750798722045, + "grad_norm": 0.22837650775909424, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 268 + }, + { + "epoch": 0.21485623003194887, + "grad_norm": 0.23372405767440796, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 269 + }, + { + "epoch": 0.21565495207667731, + "grad_norm": 0.2030618041753769, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 270 + }, + { + "epoch": 0.21645367412140576, + "grad_norm": 0.2092818021774292, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 271 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.18329963088035583, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 272 + }, + { + "epoch": 0.21805111821086262, + "grad_norm": 0.07353675365447998, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 273 + }, + { + "epoch": 0.21884984025559107, + "grad_norm": 0.08853492140769958, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 274 + }, + { + "epoch": 0.21964856230031948, + "grad_norm": 0.14666804671287537, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 275 + }, + { + "epoch": 0.22044728434504793, + "grad_norm": 0.12529602646827698, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 276 + }, + { + "epoch": 0.22124600638977635, + "grad_norm": 0.1571074277162552, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 277 + }, + { + "epoch": 0.2220447284345048, + "grad_norm": 0.09636949002742767, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 278 + }, + { + "epoch": 0.22284345047923323, + "grad_norm": 0.16803453862667084, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 279 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 0.258849561214447, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 280 + }, + { + "epoch": 0.2244408945686901, + "grad_norm": 0.29162102937698364, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 281 + }, + { + "epoch": 0.22523961661341854, + "grad_norm": 0.32085782289505005, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 282 + }, + { + "epoch": 0.22603833865814696, + "grad_norm": 0.24114084243774414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 283 + }, + { + "epoch": 0.2268370607028754, + "grad_norm": 0.11804991215467453, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 284 + }, + { + "epoch": 0.22763578274760382, + "grad_norm": 0.16640789806842804, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 285 + }, + { + "epoch": 0.22843450479233227, + "grad_norm": 0.33951282501220703, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 286 + }, + { + "epoch": 0.2292332268370607, + "grad_norm": 0.3939269483089447, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 287 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.2742229402065277, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 288 + }, + { + "epoch": 0.23083067092651757, + "grad_norm": 0.1000385507941246, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 289 + }, + { + "epoch": 0.23162939297124602, + "grad_norm": 0.15618765354156494, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 290 + }, + { + "epoch": 0.23242811501597443, + "grad_norm": 0.3464474081993103, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 291 + }, + { + "epoch": 0.23322683706070288, + "grad_norm": 0.4524421989917755, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 292 + }, + { + "epoch": 0.2340255591054313, + "grad_norm": 0.38890203833580017, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 293 + }, + { + "epoch": 0.23482428115015974, + "grad_norm": 0.15225796401500702, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 294 + }, + { + "epoch": 0.2356230031948882, + "grad_norm": 0.18742015957832336, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 295 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 0.454607754945755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 296 + }, + { + "epoch": 0.23722044728434505, + "grad_norm": 0.4426102638244629, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 297 + }, + { + "epoch": 0.2380191693290735, + "grad_norm": 0.1442587673664093, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 298 + }, + { + "epoch": 0.2388178913738019, + "grad_norm": 0.2338172197341919, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 299 + }, + { + "epoch": 0.23961661341853036, + "grad_norm": 0.4115936756134033, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 300 + }, + { + "epoch": 0.24041533546325877, + "grad_norm": 0.38746342062950134, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 301 + }, + { + "epoch": 0.24121405750798722, + "grad_norm": 0.11506912112236023, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 302 + }, + { + "epoch": 0.24201277955271566, + "grad_norm": 0.20454810559749603, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 303 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.34620603919029236, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 304 + }, + { + "epoch": 0.24361022364217252, + "grad_norm": 0.27727624773979187, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 305 + }, + { + "epoch": 0.24440894568690097, + "grad_norm": 0.062395140528678894, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 306 + }, + { + "epoch": 0.2452076677316294, + "grad_norm": 0.25391891598701477, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 307 + }, + { + "epoch": 0.24600638977635783, + "grad_norm": 0.3807840049266815, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 308 + }, + { + "epoch": 0.24680511182108625, + "grad_norm": 0.31564414501190186, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 309 + }, + { + "epoch": 0.2476038338658147, + "grad_norm": 0.044667672365903854, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 310 + }, + { + "epoch": 0.24840255591054314, + "grad_norm": 0.2656041979789734, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 311 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 0.2954655587673187, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 312 + }, + { + "epoch": 0.25, + "grad_norm": 0.14636820554733276, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 313 + }, + { + "epoch": 0.2507987220447284, + "grad_norm": 0.16759099066257477, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 314 + }, + { + "epoch": 0.2515974440894569, + "grad_norm": 0.28777605295181274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 315 + }, + { + "epoch": 0.2523961661341853, + "grad_norm": 0.2817089855670929, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 316 + }, + { + "epoch": 0.2531948881789137, + "grad_norm": 0.09457004815340042, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 317 + }, + { + "epoch": 0.2539936102236422, + "grad_norm": 0.15224558115005493, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 318 + }, + { + "epoch": 0.2547923322683706, + "grad_norm": 0.17883236706256866, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 319 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.08269336074590683, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 320 + }, + { + "epoch": 0.2563897763578275, + "grad_norm": 0.10430650413036346, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 321 + }, + { + "epoch": 0.2571884984025559, + "grad_norm": 0.06464210897684097, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 322 + }, + { + "epoch": 0.25798722044728434, + "grad_norm": 0.08100844919681549, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 323 + }, + { + "epoch": 0.25878594249201275, + "grad_norm": 0.10375291109085083, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 324 + }, + { + "epoch": 0.2595846645367412, + "grad_norm": 0.14621509611606598, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 325 + }, + { + "epoch": 0.26038338658146964, + "grad_norm": 0.12707975506782532, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 326 + }, + { + "epoch": 0.26118210862619806, + "grad_norm": 0.04542430862784386, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 327 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 0.13504259288311005, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 328 + }, + { + "epoch": 0.26277955271565495, + "grad_norm": 0.20337320864200592, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 329 + }, + { + "epoch": 0.26357827476038337, + "grad_norm": 0.23682020604610443, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 330 + }, + { + "epoch": 0.26437699680511184, + "grad_norm": 0.15198387205600739, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 331 + }, + { + "epoch": 0.26517571884984026, + "grad_norm": 0.04014969989657402, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 332 + }, + { + "epoch": 0.2659744408945687, + "grad_norm": 0.10505357384681702, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 333 + }, + { + "epoch": 0.26677316293929715, + "grad_norm": 0.08121145516633987, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 334 + }, + { + "epoch": 0.26757188498402557, + "grad_norm": 0.062118109315633774, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 335 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.13389311730861664, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 336 + }, + { + "epoch": 0.26916932907348246, + "grad_norm": 0.24840199947357178, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 337 + }, + { + "epoch": 0.26996805111821087, + "grad_norm": 0.33511659502983093, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 338 + }, + { + "epoch": 0.2707667731629393, + "grad_norm": 0.2905866801738739, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 339 + }, + { + "epoch": 0.2715654952076677, + "grad_norm": 0.15471668541431427, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 340 + }, + { + "epoch": 0.2723642172523962, + "grad_norm": 0.09973842650651932, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 341 + }, + { + "epoch": 0.2731629392971246, + "grad_norm": 0.19315758347511292, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 342 + }, + { + "epoch": 0.273961661341853, + "grad_norm": 0.2122231423854828, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 343 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 0.11207931488752365, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 344 + }, + { + "epoch": 0.2755591054313099, + "grad_norm": 0.11863203346729279, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 345 + }, + { + "epoch": 0.2763578274760383, + "grad_norm": 0.22022183239459991, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 346 + }, + { + "epoch": 0.2771565495207668, + "grad_norm": 0.225724458694458, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 347 + }, + { + "epoch": 0.2779552715654952, + "grad_norm": 0.1622191071510315, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 348 + }, + { + "epoch": 0.2787539936102236, + "grad_norm": 0.05987359210848808, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 349 + }, + { + "epoch": 0.2795527156549521, + "grad_norm": 0.08514829725027084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 350 + }, + { + "epoch": 0.2803514376996805, + "grad_norm": 0.10734611004590988, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 351 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.12458663433790207, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 352 + }, + { + "epoch": 0.2819488817891374, + "grad_norm": 0.12223048508167267, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 353 + }, + { + "epoch": 0.2827476038338658, + "grad_norm": 0.0663333311676979, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 354 + }, + { + "epoch": 0.28354632587859424, + "grad_norm": 0.0628359317779541, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 355 + }, + { + "epoch": 0.28434504792332266, + "grad_norm": 0.1566074788570404, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 356 + }, + { + "epoch": 0.28514376996805113, + "grad_norm": 0.23291122913360596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 357 + }, + { + "epoch": 0.28594249201277955, + "grad_norm": 0.21403467655181885, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 358 + }, + { + "epoch": 0.28674121405750796, + "grad_norm": 0.08412498980760574, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 359 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 0.1415901631116867, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 360 + }, + { + "epoch": 0.28833865814696485, + "grad_norm": 0.29960349202156067, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 361 + }, + { + "epoch": 0.28913738019169327, + "grad_norm": 0.33849450945854187, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 362 + }, + { + "epoch": 0.28993610223642174, + "grad_norm": 0.24428068101406097, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 363 + }, + { + "epoch": 0.29073482428115016, + "grad_norm": 0.07897785305976868, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 364 + }, + { + "epoch": 0.2915335463258786, + "grad_norm": 0.1347426027059555, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 365 + }, + { + "epoch": 0.29233226837060705, + "grad_norm": 0.21387724578380585, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 366 + }, + { + "epoch": 0.29313099041533547, + "grad_norm": 0.13869348168373108, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 367 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.062060993164777756, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 368 + }, + { + "epoch": 0.29472843450479236, + "grad_norm": 0.13848915696144104, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 369 + }, + { + "epoch": 0.2955271565495208, + "grad_norm": 0.12179117649793625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 370 + }, + { + "epoch": 0.2963258785942492, + "grad_norm": 0.13039280474185944, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 371 + }, + { + "epoch": 0.2971246006389776, + "grad_norm": 0.09119348227977753, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 372 + }, + { + "epoch": 0.2979233226837061, + "grad_norm": 0.06374438107013702, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 373 + }, + { + "epoch": 0.2987220447284345, + "grad_norm": 0.1524113267660141, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 374 + }, + { + "epoch": 0.2995207667731629, + "grad_norm": 0.18103912472724915, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 375 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 0.1439986377954483, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 376 + }, + { + "epoch": 0.3011182108626198, + "grad_norm": 0.1268371045589447, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 377 + }, + { + "epoch": 0.3019169329073482, + "grad_norm": 0.07370569556951523, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 378 + }, + { + "epoch": 0.3027156549520767, + "grad_norm": 0.0718536451458931, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 379 + }, + { + "epoch": 0.3035143769968051, + "grad_norm": 0.10444384068250656, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 380 + }, + { + "epoch": 0.30431309904153353, + "grad_norm": 0.10085552930831909, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 381 + }, + { + "epoch": 0.305111821086262, + "grad_norm": 0.08599484711885452, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 382 + }, + { + "epoch": 0.3059105431309904, + "grad_norm": 0.08912923187017441, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 383 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.17919759452342987, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 384 + }, + { + "epoch": 0.3075079872204473, + "grad_norm": 0.23954501748085022, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 385 + }, + { + "epoch": 0.3083067092651757, + "grad_norm": 0.2940942645072937, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 386 + }, + { + "epoch": 0.30910543130990414, + "grad_norm": 0.2905970513820648, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 387 + }, + { + "epoch": 0.30990415335463256, + "grad_norm": 0.2555491626262665, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 388 + }, + { + "epoch": 0.31070287539936103, + "grad_norm": 0.15303272008895874, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 389 + }, + { + "epoch": 0.31150159744408945, + "grad_norm": 0.10148895531892776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 390 + }, + { + "epoch": 0.31230031948881787, + "grad_norm": 0.21828792989253998, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 391 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 0.27219685912132263, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 392 + }, + { + "epoch": 0.31389776357827476, + "grad_norm": 0.3431699872016907, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 393 + }, + { + "epoch": 0.3146964856230032, + "grad_norm": 0.32346805930137634, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 394 + }, + { + "epoch": 0.31549520766773165, + "grad_norm": 0.17791730165481567, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 395 + }, + { + "epoch": 0.31629392971246006, + "grad_norm": 0.09576063603162766, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 396 + }, + { + "epoch": 0.3170926517571885, + "grad_norm": 0.050598498433828354, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 397 + }, + { + "epoch": 0.31789137380191695, + "grad_norm": 0.07385009527206421, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 398 + }, + { + "epoch": 0.31869009584664537, + "grad_norm": 0.08680527657270432, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 399 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.06436332315206528, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 400 + }, + { + "epoch": 0.32028753993610226, + "grad_norm": 0.05943639203906059, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 401 + }, + { + "epoch": 0.3210862619808307, + "grad_norm": 0.10015929490327835, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 402 + }, + { + "epoch": 0.3218849840255591, + "grad_norm": 0.07852698862552643, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 403 + }, + { + "epoch": 0.3226837060702875, + "grad_norm": 0.06103534996509552, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 404 + }, + { + "epoch": 0.323482428115016, + "grad_norm": 0.04573113098740578, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 405 + }, + { + "epoch": 0.3242811501597444, + "grad_norm": 0.06108849495649338, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 406 + }, + { + "epoch": 0.3250798722044728, + "grad_norm": 0.10209841281175613, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 407 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 0.0956021398305893, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 408 + }, + { + "epoch": 0.3266773162939297, + "grad_norm": 0.12572422623634338, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 409 + }, + { + "epoch": 0.3274760383386581, + "grad_norm": 0.1532585173845291, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 410 + }, + { + "epoch": 0.3282747603833866, + "grad_norm": 0.10664337128400803, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 411 + }, + { + "epoch": 0.329073482428115, + "grad_norm": 0.07705336064100266, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 412 + }, + { + "epoch": 0.32987220447284343, + "grad_norm": 0.08611477166414261, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 413 + }, + { + "epoch": 0.3306709265175719, + "grad_norm": 0.11460789293050766, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 414 + }, + { + "epoch": 0.3314696485623003, + "grad_norm": 0.1214505136013031, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 415 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.07482243329286575, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 416 + }, + { + "epoch": 0.3330670926517572, + "grad_norm": 0.05022026225924492, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 417 + }, + { + "epoch": 0.33386581469648563, + "grad_norm": 0.086161769926548, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 418 + }, + { + "epoch": 0.33466453674121405, + "grad_norm": 0.05073339864611626, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 419 + }, + { + "epoch": 0.3354632587859425, + "grad_norm": 0.0925290584564209, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 420 + }, + { + "epoch": 0.33626198083067094, + "grad_norm": 0.08073565363883972, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 421 + }, + { + "epoch": 0.33706070287539935, + "grad_norm": 0.06067343428730965, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 422 + }, + { + "epoch": 0.33785942492012777, + "grad_norm": 0.16081079840660095, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 423 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 0.3043743371963501, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 424 + }, + { + "epoch": 0.33945686900958466, + "grad_norm": 0.32498979568481445, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 425 + }, + { + "epoch": 0.3402555910543131, + "grad_norm": 0.206096351146698, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 426 + }, + { + "epoch": 0.34105431309904155, + "grad_norm": 0.11892937123775482, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 427 + }, + { + "epoch": 0.34185303514376997, + "grad_norm": 0.19896888732910156, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 428 + }, + { + "epoch": 0.3426517571884984, + "grad_norm": 0.3295411169528961, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 429 + }, + { + "epoch": 0.34345047923322686, + "grad_norm": 0.3841599225997925, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 430 + }, + { + "epoch": 0.3442492012779553, + "grad_norm": 0.36113840341567993, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 431 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.25694623589515686, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 432 + }, + { + "epoch": 0.34584664536741216, + "grad_norm": 0.07741750776767731, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 433 + }, + { + "epoch": 0.3466453674121406, + "grad_norm": 0.1385476440191269, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 434 + }, + { + "epoch": 0.347444089456869, + "grad_norm": 0.22972947359085083, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 435 + }, + { + "epoch": 0.34824281150159747, + "grad_norm": 0.15720337629318237, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 436 + }, + { + "epoch": 0.3490415335463259, + "grad_norm": 0.04451138526201248, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 437 + }, + { + "epoch": 0.3498402555910543, + "grad_norm": 0.15054486691951752, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 438 + }, + { + "epoch": 0.3506389776357827, + "grad_norm": 0.16740895807743073, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 439 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 0.1388419270515442, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 440 + }, + { + "epoch": 0.3522364217252396, + "grad_norm": 0.06480700522661209, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 441 + }, + { + "epoch": 0.35303514376996803, + "grad_norm": 0.09604794532060623, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 442 + }, + { + "epoch": 0.3538338658146965, + "grad_norm": 0.174916610121727, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.3546325878594249, + "grad_norm": 0.2228047251701355, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 444 + }, + { + "epoch": 0.35543130990415334, + "grad_norm": 0.24461773037910461, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 445 + }, + { + "epoch": 0.3562300319488818, + "grad_norm": 0.2201017141342163, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 446 + }, + { + "epoch": 0.3570287539936102, + "grad_norm": 0.11596337705850601, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 447 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.1682164967060089, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 448 + }, + { + "epoch": 0.3586261980830671, + "grad_norm": 0.4297041594982147, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 449 + }, + { + "epoch": 0.35942492012779553, + "grad_norm": 0.5659548044204712, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 450 + }, + { + "epoch": 0.36022364217252395, + "grad_norm": 0.5303114652633667, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 451 + }, + { + "epoch": 0.3610223642172524, + "grad_norm": 0.23788955807685852, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 452 + }, + { + "epoch": 0.36182108626198084, + "grad_norm": 0.15622566640377045, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 453 + }, + { + "epoch": 0.36261980830670926, + "grad_norm": 0.327275812625885, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 454 + }, + { + "epoch": 0.3634185303514377, + "grad_norm": 0.23511037230491638, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 455 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 0.11690831184387207, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 456 + }, + { + "epoch": 0.36501597444089456, + "grad_norm": 0.17950886487960815, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 457 + }, + { + "epoch": 0.365814696485623, + "grad_norm": 0.13816051185131073, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 458 + }, + { + "epoch": 0.36661341853035145, + "grad_norm": 0.09056458622217178, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 459 + }, + { + "epoch": 0.36741214057507987, + "grad_norm": 0.1648412048816681, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 460 + }, + { + "epoch": 0.3682108626198083, + "grad_norm": 0.24407249689102173, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 461 + }, + { + "epoch": 0.36900958466453676, + "grad_norm": 0.1896992176771164, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 462 + }, + { + "epoch": 0.3698083067092652, + "grad_norm": 0.07938385009765625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 463 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.10241381078958511, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 464 + }, + { + "epoch": 0.37140575079872207, + "grad_norm": 0.14765797555446625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 465 + }, + { + "epoch": 0.3722044728434505, + "grad_norm": 0.11189796775579453, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 466 + }, + { + "epoch": 0.3730031948881789, + "grad_norm": 0.05604114383459091, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 467 + }, + { + "epoch": 0.3738019169329074, + "grad_norm": 0.18633529543876648, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 468 + }, + { + "epoch": 0.3746006389776358, + "grad_norm": 0.2587120234966278, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 469 + }, + { + "epoch": 0.3753993610223642, + "grad_norm": 0.21629218757152557, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 470 + }, + { + "epoch": 0.3761980830670926, + "grad_norm": 0.11872006952762604, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 471 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 0.07732011377811432, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 472 + }, + { + "epoch": 0.3777955271565495, + "grad_norm": 0.20141537487506866, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 473 + }, + { + "epoch": 0.37859424920127793, + "grad_norm": 0.26726409792900085, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 474 + }, + { + "epoch": 0.3793929712460064, + "grad_norm": 0.2373354583978653, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 475 + }, + { + "epoch": 0.3801916932907348, + "grad_norm": 0.15030571818351746, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 476 + }, + { + "epoch": 0.38099041533546324, + "grad_norm": 0.05345006287097931, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 477 + }, + { + "epoch": 0.3817891373801917, + "grad_norm": 0.12551648914813995, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 478 + }, + { + "epoch": 0.38258785942492013, + "grad_norm": 0.14036186039447784, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 479 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.09807970374822617, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 480 + }, + { + "epoch": 0.384185303514377, + "grad_norm": 0.05071088671684265, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 481 + }, + { + "epoch": 0.38498402555910544, + "grad_norm": 0.07541649043560028, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 482 + }, + { + "epoch": 0.38578274760383385, + "grad_norm": 0.059762127697467804, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 483 + }, + { + "epoch": 0.3865814696485623, + "grad_norm": 0.05540496110916138, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 484 + }, + { + "epoch": 0.38738019169329074, + "grad_norm": 0.09137953072786331, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 485 + }, + { + "epoch": 0.38817891373801916, + "grad_norm": 0.1349237710237503, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 486 + }, + { + "epoch": 0.3889776357827476, + "grad_norm": 0.13889296352863312, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 487 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 0.16406965255737305, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 488 + }, + { + "epoch": 0.39057507987220447, + "grad_norm": 0.1748959869146347, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 489 + }, + { + "epoch": 0.3913738019169329, + "grad_norm": 0.1518068015575409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 490 + }, + { + "epoch": 0.39217252396166136, + "grad_norm": 0.06694433838129044, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 491 + }, + { + "epoch": 0.3929712460063898, + "grad_norm": 0.11556574702262878, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 492 + }, + { + "epoch": 0.3937699680511182, + "grad_norm": 0.2562897801399231, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 493 + }, + { + "epoch": 0.39456869009584666, + "grad_norm": 0.30842337012290955, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 494 + }, + { + "epoch": 0.3953674121405751, + "grad_norm": 0.30477815866470337, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 495 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.2602941691875458, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 496 + }, + { + "epoch": 0.39696485623003197, + "grad_norm": 0.1692838817834854, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 497 + }, + { + "epoch": 0.3977635782747604, + "grad_norm": 0.07468903064727783, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 498 + }, + { + "epoch": 0.3985623003194888, + "grad_norm": 0.05872616916894913, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 499 + }, + { + "epoch": 0.3993610223642173, + "grad_norm": 0.09878433495759964, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 500 + }, + { + "epoch": 0.4001597444089457, + "grad_norm": 0.13779069483280182, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 501 + }, + { + "epoch": 0.4009584664536741, + "grad_norm": 0.17778213322162628, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 502 + }, + { + "epoch": 0.40175718849840253, + "grad_norm": 0.15572750568389893, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 503 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 0.1154002770781517, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 504 + }, + { + "epoch": 0.4033546325878594, + "grad_norm": 0.04485362395644188, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 505 + }, + { + "epoch": 0.40415335463258784, + "grad_norm": 0.07514321058988571, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 506 + }, + { + "epoch": 0.4049520766773163, + "grad_norm": 0.13954220712184906, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 507 + }, + { + "epoch": 0.4057507987220447, + "grad_norm": 0.20726922154426575, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 508 + }, + { + "epoch": 0.40654952076677314, + "grad_norm": 0.28239160776138306, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 509 + }, + { + "epoch": 0.4073482428115016, + "grad_norm": 0.28484129905700684, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 510 + }, + { + "epoch": 0.40814696485623003, + "grad_norm": 0.28111377358436584, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 511 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.25087496638298035, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 512 + }, + { + "epoch": 0.4097444089456869, + "grad_norm": 0.1652008444070816, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 513 + }, + { + "epoch": 0.41054313099041534, + "grad_norm": 0.11345700174570084, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 514 + }, + { + "epoch": 0.41134185303514376, + "grad_norm": 0.1191159337759018, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 515 + }, + { + "epoch": 0.41214057507987223, + "grad_norm": 0.26302817463874817, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 516 + }, + { + "epoch": 0.41293929712460065, + "grad_norm": 0.3303217589855194, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 517 + }, + { + "epoch": 0.41373801916932906, + "grad_norm": 0.2874647378921509, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 518 + }, + { + "epoch": 0.4145367412140575, + "grad_norm": 0.23112182319164276, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 519 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 0.16285021603107452, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 520 + }, + { + "epoch": 0.41613418530351437, + "grad_norm": 0.08440099656581879, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 521 + }, + { + "epoch": 0.4169329073482428, + "grad_norm": 0.03578028455376625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 522 + }, + { + "epoch": 0.41773162939297126, + "grad_norm": 0.0995275005698204, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 523 + }, + { + "epoch": 0.4185303514376997, + "grad_norm": 0.17713160812854767, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 524 + }, + { + "epoch": 0.4193290734824281, + "grad_norm": 0.1685509830713272, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 525 + }, + { + "epoch": 0.42012779552715657, + "grad_norm": 0.11357919126749039, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 526 + }, + { + "epoch": 0.420926517571885, + "grad_norm": 0.059025365859270096, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 527 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.05128806456923485, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 528 + }, + { + "epoch": 0.4225239616613419, + "grad_norm": 0.05291247367858887, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 529 + }, + { + "epoch": 0.4233226837060703, + "grad_norm": 0.10755500197410583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 530 + }, + { + "epoch": 0.4241214057507987, + "grad_norm": 0.15659615397453308, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 531 + }, + { + "epoch": 0.4249201277955272, + "grad_norm": 0.19369953870773315, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 532 + }, + { + "epoch": 0.4257188498402556, + "grad_norm": 0.16491396725177765, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 533 + }, + { + "epoch": 0.426517571884984, + "grad_norm": 0.10276799649000168, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 534 + }, + { + "epoch": 0.4273162939297125, + "grad_norm": 0.06273368000984192, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 535 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 0.03896406292915344, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 536 + }, + { + "epoch": 0.4289137380191693, + "grad_norm": 0.08083273470401764, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 537 + }, + { + "epoch": 0.42971246006389774, + "grad_norm": 0.05107828602194786, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 538 + }, + { + "epoch": 0.4305111821086262, + "grad_norm": 0.04359392821788788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 539 + }, + { + "epoch": 0.43130990415335463, + "grad_norm": 0.04225402697920799, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 540 + }, + { + "epoch": 0.43210862619808305, + "grad_norm": 0.07523404061794281, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 541 + }, + { + "epoch": 0.4329073482428115, + "grad_norm": 0.07966417819261551, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 542 + }, + { + "epoch": 0.43370607028753994, + "grad_norm": 0.04529299959540367, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 543 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.0793156549334526, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 544 + }, + { + "epoch": 0.4353035143769968, + "grad_norm": 0.1533992737531662, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 545 + }, + { + "epoch": 0.43610223642172524, + "grad_norm": 0.2893797755241394, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 546 + }, + { + "epoch": 0.43690095846645366, + "grad_norm": 0.4145842492580414, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 547 + }, + { + "epoch": 0.43769968051118213, + "grad_norm": 0.4550987482070923, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 548 + }, + { + "epoch": 0.43849840255591055, + "grad_norm": 0.4318651556968689, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 549 + }, + { + "epoch": 0.43929712460063897, + "grad_norm": 0.35961681604385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 550 + }, + { + "epoch": 0.44009584664536744, + "grad_norm": 0.18606753647327423, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 551 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 0.12992478907108307, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 552 + }, + { + "epoch": 0.4416932907348243, + "grad_norm": 0.32936930656433105, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 553 + }, + { + "epoch": 0.4424920127795527, + "grad_norm": 0.3547491133213043, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 554 + }, + { + "epoch": 0.44329073482428116, + "grad_norm": 0.2144627720117569, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 555 + }, + { + "epoch": 0.4440894568690096, + "grad_norm": 0.07260395586490631, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 556 + }, + { + "epoch": 0.444888178913738, + "grad_norm": 0.19895662367343903, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 557 + }, + { + "epoch": 0.44568690095846647, + "grad_norm": 0.18664990365505219, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 558 + }, + { + "epoch": 0.4464856230031949, + "grad_norm": 0.11666610836982727, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 559 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.11163592338562012, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 560 + }, + { + "epoch": 0.4480830670926518, + "grad_norm": 0.1815878301858902, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 561 + }, + { + "epoch": 0.4488817891373802, + "grad_norm": 0.2593924105167389, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 562 + }, + { + "epoch": 0.4496805111821086, + "grad_norm": 0.20761220157146454, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 563 + }, + { + "epoch": 0.4504792332268371, + "grad_norm": 0.06589766591787338, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 564 + }, + { + "epoch": 0.4512779552715655, + "grad_norm": 0.21619920432567596, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 565 + }, + { + "epoch": 0.4520766773162939, + "grad_norm": 0.2392708659172058, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 566 + }, + { + "epoch": 0.4528753993610224, + "grad_norm": 0.23214633762836456, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 567 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 0.263883501291275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 568 + }, + { + "epoch": 0.4544728434504792, + "grad_norm": 0.19914190471172333, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 569 + }, + { + "epoch": 0.45527156549520764, + "grad_norm": 0.11453433334827423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 570 + }, + { + "epoch": 0.4560702875399361, + "grad_norm": 0.15091221034526825, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 571 + }, + { + "epoch": 0.45686900958466453, + "grad_norm": 0.043582383543252945, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 572 + }, + { + "epoch": 0.45766773162939295, + "grad_norm": 0.14068740606307983, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 573 + }, + { + "epoch": 0.4584664536741214, + "grad_norm": 0.1274290233850479, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 574 + }, + { + "epoch": 0.45926517571884984, + "grad_norm": 0.13504599034786224, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 575 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.1267779916524887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 576 + }, + { + "epoch": 0.46086261980830673, + "grad_norm": 0.08138085901737213, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 577 + }, + { + "epoch": 0.46166134185303515, + "grad_norm": 0.07772356271743774, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 578 + }, + { + "epoch": 0.46246006389776356, + "grad_norm": 0.06863631308078766, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 579 + }, + { + "epoch": 0.46325878594249204, + "grad_norm": 0.1232575923204422, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 580 + }, + { + "epoch": 0.46405750798722045, + "grad_norm": 0.179134801030159, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 581 + }, + { + "epoch": 0.46485623003194887, + "grad_norm": 0.20545582473278046, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 582 + }, + { + "epoch": 0.46565495207667734, + "grad_norm": 0.14182575047016144, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 583 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 0.05813328176736832, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 584 + }, + { + "epoch": 0.4672523961661342, + "grad_norm": 0.1530984789133072, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 585 + }, + { + "epoch": 0.4680511182108626, + "grad_norm": 0.2820036709308624, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 586 + }, + { + "epoch": 0.46884984025559107, + "grad_norm": 0.39252954721450806, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 587 + }, + { + "epoch": 0.4696485623003195, + "grad_norm": 0.40830549597740173, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 588 + }, + { + "epoch": 0.4704472843450479, + "grad_norm": 0.2846182882785797, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 589 + }, + { + "epoch": 0.4712460063897764, + "grad_norm": 0.06798163801431656, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 590 + }, + { + "epoch": 0.4720447284345048, + "grad_norm": 0.18650950491428375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 591 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.2965260446071625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 592 + }, + { + "epoch": 0.4736421725239617, + "grad_norm": 0.24504852294921875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 593 + }, + { + "epoch": 0.4744408945686901, + "grad_norm": 0.11336984485387802, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 594 + }, + { + "epoch": 0.4752396166134185, + "grad_norm": 0.09007567912340164, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 595 + }, + { + "epoch": 0.476038338658147, + "grad_norm": 0.225834459066391, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 596 + }, + { + "epoch": 0.4768370607028754, + "grad_norm": 0.2679842710494995, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 597 + }, + { + "epoch": 0.4776357827476038, + "grad_norm": 0.1801901012659073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 598 + }, + { + "epoch": 0.4784345047923323, + "grad_norm": 0.09554167836904526, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 599 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 0.046632468700408936, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 600 + }, + { + "epoch": 0.48003194888178913, + "grad_norm": 0.12078758329153061, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 601 + }, + { + "epoch": 0.48083067092651754, + "grad_norm": 0.12126865237951279, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 602 + }, + { + "epoch": 0.481629392971246, + "grad_norm": 0.14078640937805176, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 603 + }, + { + "epoch": 0.48242811501597443, + "grad_norm": 0.18556037545204163, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 604 + }, + { + "epoch": 0.48322683706070285, + "grad_norm": 0.178151473402977, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 605 + }, + { + "epoch": 0.4840255591054313, + "grad_norm": 0.1672516018152237, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 606 + }, + { + "epoch": 0.48482428115015974, + "grad_norm": 0.11648737639188766, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 607 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.11820051819086075, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 608 + }, + { + "epoch": 0.48642172523961663, + "grad_norm": 0.21110932528972626, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 609 + }, + { + "epoch": 0.48722044728434505, + "grad_norm": 0.24852754175662994, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 610 + }, + { + "epoch": 0.48801916932907347, + "grad_norm": 0.2633175551891327, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 611 + }, + { + "epoch": 0.48881789137380194, + "grad_norm": 0.21904303133487701, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 612 + }, + { + "epoch": 0.48961661341853036, + "grad_norm": 0.07822466641664505, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 613 + }, + { + "epoch": 0.4904153354632588, + "grad_norm": 0.0767827108502388, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 614 + }, + { + "epoch": 0.49121405750798725, + "grad_norm": 0.07943699508905411, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 615 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 0.055741772055625916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 616 + }, + { + "epoch": 0.4928115015974441, + "grad_norm": 0.10400068014860153, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 617 + }, + { + "epoch": 0.4936102236421725, + "grad_norm": 0.05080602690577507, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 618 + }, + { + "epoch": 0.49440894568690097, + "grad_norm": 0.07927533984184265, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 619 + }, + { + "epoch": 0.4952076677316294, + "grad_norm": 0.07919944822788239, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 620 + }, + { + "epoch": 0.4960063897763578, + "grad_norm": 0.11013699322938919, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 621 + }, + { + "epoch": 0.4968051118210863, + "grad_norm": 0.16232389211654663, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 622 + }, + { + "epoch": 0.4976038338658147, + "grad_norm": 0.17625346779823303, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 623 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.1681327521800995, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 624 + }, + { + "epoch": 0.4992012779552716, + "grad_norm": 0.1882159262895584, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 625 + }, + { + "epoch": 0.5, + "grad_norm": 0.21075129508972168, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 626 + }, + { + "epoch": 0.5007987220447284, + "grad_norm": 0.1464296281337738, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 627 + }, + { + "epoch": 0.5015974440894568, + "grad_norm": 0.11155212670564651, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 628 + }, + { + "epoch": 0.5023961661341853, + "grad_norm": 0.09794416278600693, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 629 + }, + { + "epoch": 0.5031948881789138, + "grad_norm": 0.12095183879137039, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 630 + }, + { + "epoch": 0.5039936102236422, + "grad_norm": 0.1933794617652893, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 631 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 0.32272887229919434, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 632 + }, + { + "epoch": 0.505591054313099, + "grad_norm": 0.2507671117782593, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 633 + }, + { + "epoch": 0.5063897763578274, + "grad_norm": 0.09540661424398422, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 634 + }, + { + "epoch": 0.5071884984025559, + "grad_norm": 0.07341819256544113, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 635 + }, + { + "epoch": 0.5079872204472844, + "grad_norm": 0.11610874533653259, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 636 + }, + { + "epoch": 0.5087859424920128, + "grad_norm": 0.1338607519865036, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 637 + }, + { + "epoch": 0.5095846645367412, + "grad_norm": 0.07892445474863052, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 638 + }, + { + "epoch": 0.5103833865814696, + "grad_norm": 0.053661834448575974, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 639 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.06852453202009201, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 640 + }, + { + "epoch": 0.5119808306709265, + "grad_norm": 0.045109208673238754, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 641 + }, + { + "epoch": 0.512779552715655, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 642 + }, + { + "epoch": 0.5135782747603834, + "grad_norm": 0.05903350189328194, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 643 + }, + { + "epoch": 0.5143769968051118, + "grad_norm": 0.07314767688512802, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 644 + }, + { + "epoch": 0.5151757188498403, + "grad_norm": 0.12484236806631088, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 645 + }, + { + "epoch": 0.5159744408945687, + "grad_norm": 0.15683352947235107, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 646 + }, + { + "epoch": 0.5167731629392971, + "grad_norm": 0.13519413769245148, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 647 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 0.10333485156297684, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 648 + }, + { + "epoch": 0.518370607028754, + "grad_norm": 0.09626923501491547, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 649 + }, + { + "epoch": 0.5191693290734825, + "grad_norm": 0.08177447319030762, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 650 + }, + { + "epoch": 0.5199680511182109, + "grad_norm": 0.04186684265732765, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 651 + }, + { + "epoch": 0.5207667731629393, + "grad_norm": 0.07705547660589218, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 652 + }, + { + "epoch": 0.5215654952076677, + "grad_norm": 0.05885700136423111, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 653 + }, + { + "epoch": 0.5223642172523961, + "grad_norm": 0.14140211045742035, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 654 + }, + { + "epoch": 0.5231629392971247, + "grad_norm": 0.18797138333320618, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 655 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.2301982045173645, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 656 + }, + { + "epoch": 0.5247603833865815, + "grad_norm": 0.2813114523887634, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 657 + }, + { + "epoch": 0.5255591054313099, + "grad_norm": 0.3205592930316925, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 658 + }, + { + "epoch": 0.5263578274760383, + "grad_norm": 0.3426150381565094, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 659 + }, + { + "epoch": 0.5271565495207667, + "grad_norm": 0.2636663615703583, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 660 + }, + { + "epoch": 0.5279552715654952, + "grad_norm": 0.14799079298973083, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 661 + }, + { + "epoch": 0.5287539936102237, + "grad_norm": 0.06354992836713791, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 662 + }, + { + "epoch": 0.5295527156549521, + "grad_norm": 0.239300936460495, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 663 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 0.33535388112068176, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 664 + }, + { + "epoch": 0.5311501597444089, + "grad_norm": 0.32471078634262085, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 665 + }, + { + "epoch": 0.5319488817891374, + "grad_norm": 0.2491266429424286, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 666 + }, + { + "epoch": 0.5327476038338658, + "grad_norm": 0.09841614216566086, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 667 + }, + { + "epoch": 0.5335463258785943, + "grad_norm": 0.1310579627752304, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 668 + }, + { + "epoch": 0.5343450479233227, + "grad_norm": 0.28287971019744873, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 669 + }, + { + "epoch": 0.5351437699680511, + "grad_norm": 0.3457719385623932, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 670 + }, + { + "epoch": 0.5359424920127795, + "grad_norm": 0.31690946221351624, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 671 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.19356760382652283, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 672 + }, + { + "epoch": 0.5375399361022364, + "grad_norm": 0.05940595269203186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 673 + }, + { + "epoch": 0.5383386581469649, + "grad_norm": 0.20772181451320648, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 674 + }, + { + "epoch": 0.5391373801916933, + "grad_norm": 0.3093980848789215, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 675 + }, + { + "epoch": 0.5399361022364217, + "grad_norm": 0.2632107734680176, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 676 + }, + { + "epoch": 0.5407348242811502, + "grad_norm": 0.12365782260894775, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 677 + }, + { + "epoch": 0.5415335463258786, + "grad_norm": 0.07215466350317001, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 678 + }, + { + "epoch": 0.542332268370607, + "grad_norm": 0.16745947301387787, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 679 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 0.14418186247348785, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 680 + }, + { + "epoch": 0.5439297124600639, + "grad_norm": 0.048094023019075394, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 681 + }, + { + "epoch": 0.5447284345047924, + "grad_norm": 0.10100048035383224, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 682 + }, + { + "epoch": 0.5455271565495208, + "grad_norm": 0.13719545304775238, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 683 + }, + { + "epoch": 0.5463258785942492, + "grad_norm": 0.16066808998584747, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 684 + }, + { + "epoch": 0.5471246006389776, + "grad_norm": 0.19201414287090302, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 685 + }, + { + "epoch": 0.547923322683706, + "grad_norm": 0.19783100485801697, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 686 + }, + { + "epoch": 0.5487220447284346, + "grad_norm": 0.1431797295808792, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 687 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.04368956387042999, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 688 + }, + { + "epoch": 0.5503194888178914, + "grad_norm": 0.12395253777503967, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 689 + }, + { + "epoch": 0.5511182108626198, + "grad_norm": 0.16278770565986633, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 690 + }, + { + "epoch": 0.5519169329073482, + "grad_norm": 0.15368889272212982, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 691 + }, + { + "epoch": 0.5527156549520766, + "grad_norm": 0.10195931792259216, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 692 + }, + { + "epoch": 0.5535143769968051, + "grad_norm": 0.03421236202120781, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 693 + }, + { + "epoch": 0.5543130990415336, + "grad_norm": 0.09549148380756378, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 694 + }, + { + "epoch": 0.555111821086262, + "grad_norm": 0.17825989425182343, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 695 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 0.25296247005462646, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 696 + }, + { + "epoch": 0.5567092651757188, + "grad_norm": 0.27566400170326233, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 697 + }, + { + "epoch": 0.5575079872204473, + "grad_norm": 0.22609780728816986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 698 + }, + { + "epoch": 0.5583067092651757, + "grad_norm": 0.10555832833051682, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 699 + }, + { + "epoch": 0.5591054313099042, + "grad_norm": 0.1309640258550644, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 700 + }, + { + "epoch": 0.5599041533546326, + "grad_norm": 0.3434476852416992, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 701 + }, + { + "epoch": 0.560702875399361, + "grad_norm": 0.4559882581233978, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 702 + }, + { + "epoch": 0.5615015974440895, + "grad_norm": 0.390683650970459, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 703 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.14178164303302765, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 704 + }, + { + "epoch": 0.5630990415335463, + "grad_norm": 0.19113974273204803, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 705 + }, + { + "epoch": 0.5638977635782748, + "grad_norm": 0.38376086950302124, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 706 + }, + { + "epoch": 0.5646964856230032, + "grad_norm": 0.3486707806587219, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 707 + }, + { + "epoch": 0.5654952076677316, + "grad_norm": 0.14712302386760712, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 708 + }, + { + "epoch": 0.5662939297124601, + "grad_norm": 0.11827494204044342, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 709 + }, + { + "epoch": 0.5670926517571885, + "grad_norm": 0.27573689818382263, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 710 + }, + { + "epoch": 0.5678913738019169, + "grad_norm": 0.2983379065990448, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 711 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 0.2019582986831665, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 712 + }, + { + "epoch": 0.5694888178913738, + "grad_norm": 0.04186725243926048, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 713 + }, + { + "epoch": 0.5702875399361023, + "grad_norm": 0.16714231669902802, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 714 + }, + { + "epoch": 0.5710862619808307, + "grad_norm": 0.24982011318206787, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 715 + }, + { + "epoch": 0.5718849840255591, + "grad_norm": 0.22021397948265076, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 716 + }, + { + "epoch": 0.5726837060702875, + "grad_norm": 0.09717470407485962, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 717 + }, + { + "epoch": 0.5734824281150159, + "grad_norm": 0.10214962065219879, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 718 + }, + { + "epoch": 0.5742811501597445, + "grad_norm": 0.15325960516929626, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 719 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.11207877099514008, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 720 + }, + { + "epoch": 0.5758785942492013, + "grad_norm": 0.05425047129392624, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 721 + }, + { + "epoch": 0.5766773162939297, + "grad_norm": 0.0703732892870903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 722 + }, + { + "epoch": 0.5774760383386581, + "grad_norm": 0.10577918589115143, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 723 + }, + { + "epoch": 0.5782747603833865, + "grad_norm": 0.13230514526367188, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 724 + }, + { + "epoch": 0.579073482428115, + "grad_norm": 0.1878778040409088, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 725 + }, + { + "epoch": 0.5798722044728435, + "grad_norm": 0.19956567883491516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 726 + }, + { + "epoch": 0.5806709265175719, + "grad_norm": 0.13732020556926727, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 727 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 0.09844338148832321, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 728 + }, + { + "epoch": 0.5822683706070287, + "grad_norm": 0.056577637791633606, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 729 + }, + { + "epoch": 0.5830670926517572, + "grad_norm": 0.0835585743188858, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 730 + }, + { + "epoch": 0.5838658146964856, + "grad_norm": 0.0910082757472992, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 731 + }, + { + "epoch": 0.5846645367412141, + "grad_norm": 0.0659257099032402, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 732 + }, + { + "epoch": 0.5854632587859425, + "grad_norm": 0.09342535585165024, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 733 + }, + { + "epoch": 0.5862619808306709, + "grad_norm": 0.0627603679895401, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 734 + }, + { + "epoch": 0.5870607028753994, + "grad_norm": 0.10535050183534622, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 735 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.13628117740154266, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 736 + }, + { + "epoch": 0.5886581469648562, + "grad_norm": 0.0715300589799881, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 737 + }, + { + "epoch": 0.5894568690095847, + "grad_norm": 0.10892884433269501, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 738 + }, + { + "epoch": 0.5902555910543131, + "grad_norm": 0.09805259853601456, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 739 + }, + { + "epoch": 0.5910543130990416, + "grad_norm": 0.14491751790046692, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 740 + }, + { + "epoch": 0.59185303514377, + "grad_norm": 0.15448585152626038, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 741 + }, + { + "epoch": 0.5926517571884984, + "grad_norm": 0.08218494802713394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 742 + }, + { + "epoch": 0.5934504792332268, + "grad_norm": 0.16311237215995789, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 743 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 0.10310494899749756, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 744 + }, + { + "epoch": 0.5950479233226837, + "grad_norm": 0.1511978805065155, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 745 + }, + { + "epoch": 0.5958466453674122, + "grad_norm": 0.20440778136253357, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 746 + }, + { + "epoch": 0.5966453674121406, + "grad_norm": 0.20918506383895874, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 747 + }, + { + "epoch": 0.597444089456869, + "grad_norm": 0.20070627331733704, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 748 + }, + { + "epoch": 0.5982428115015974, + "grad_norm": 0.1142180860042572, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 749 + }, + { + "epoch": 0.5990415335463258, + "grad_norm": 0.09418357163667679, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 750 + }, + { + "epoch": 0.5998402555910544, + "grad_norm": 0.24306562542915344, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 751 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.3208121955394745, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 752 + }, + { + "epoch": 0.6014376996805112, + "grad_norm": 0.3070276081562042, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 753 + }, + { + "epoch": 0.6022364217252396, + "grad_norm": 0.17130877077579498, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 754 + }, + { + "epoch": 0.603035143769968, + "grad_norm": 0.0733935534954071, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 755 + }, + { + "epoch": 0.6038338658146964, + "grad_norm": 0.25525134801864624, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 756 + }, + { + "epoch": 0.604632587859425, + "grad_norm": 0.39397957921028137, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 757 + }, + { + "epoch": 0.6054313099041534, + "grad_norm": 0.39015471935272217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 758 + }, + { + "epoch": 0.6062300319488818, + "grad_norm": 0.1757609099149704, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 759 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 0.19901637732982635, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 760 + }, + { + "epoch": 0.6078274760383386, + "grad_norm": 0.46885979175567627, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 761 + }, + { + "epoch": 0.6086261980830671, + "grad_norm": 0.4650067687034607, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 762 + }, + { + "epoch": 0.6094249201277955, + "grad_norm": 0.16624194383621216, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 763 + }, + { + "epoch": 0.610223642172524, + "grad_norm": 0.23347698152065277, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 764 + }, + { + "epoch": 0.6110223642172524, + "grad_norm": 0.40192991495132446, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 765 + }, + { + "epoch": 0.6118210862619808, + "grad_norm": 0.33640867471694946, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 766 + }, + { + "epoch": 0.6126198083067093, + "grad_norm": 0.11979667842388153, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 767 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.17994286119937897, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 768 + }, + { + "epoch": 0.6142172523961661, + "grad_norm": 0.2693847715854645, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 769 + }, + { + "epoch": 0.6150159744408946, + "grad_norm": 0.2041584849357605, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 770 + }, + { + "epoch": 0.615814696485623, + "grad_norm": 0.052040908485651016, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 771 + }, + { + "epoch": 0.6166134185303515, + "grad_norm": 0.18652868270874023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 772 + }, + { + "epoch": 0.6174121405750799, + "grad_norm": 0.26122182607650757, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 773 + }, + { + "epoch": 0.6182108626198083, + "grad_norm": 0.15385891497135162, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 774 + }, + { + "epoch": 0.6190095846645367, + "grad_norm": 0.09217085689306259, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 775 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 0.23316404223442078, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 776 + }, + { + "epoch": 0.6206070287539937, + "grad_norm": 0.24094274640083313, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 777 + }, + { + "epoch": 0.6214057507987221, + "grad_norm": 0.08518059551715851, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 778 + }, + { + "epoch": 0.6222044728434505, + "grad_norm": 0.11076594144105911, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 779 + }, + { + "epoch": 0.6230031948881789, + "grad_norm": 0.1963978409767151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 780 + }, + { + "epoch": 0.6238019169329073, + "grad_norm": 0.1526973396539688, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 781 + }, + { + "epoch": 0.6246006389776357, + "grad_norm": 0.09434971958398819, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 782 + }, + { + "epoch": 0.6253993610223643, + "grad_norm": 0.2677021622657776, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 783 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.2885434329509735, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 784 + }, + { + "epoch": 0.6269968051118211, + "grad_norm": 0.14111816883087158, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 785 + }, + { + "epoch": 0.6277955271565495, + "grad_norm": 0.06594719737768173, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 786 + }, + { + "epoch": 0.6285942492012779, + "grad_norm": 0.09837283194065094, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 787 + }, + { + "epoch": 0.6293929712460063, + "grad_norm": 0.06089933589100838, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 788 + }, + { + "epoch": 0.6301916932907349, + "grad_norm": 0.16248181462287903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 789 + }, + { + "epoch": 0.6309904153354633, + "grad_norm": 0.298454612493515, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 790 + }, + { + "epoch": 0.6317891373801917, + "grad_norm": 0.3365437090396881, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 791 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 0.22858452796936035, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 792 + }, + { + "epoch": 0.6333865814696485, + "grad_norm": 0.04849984869360924, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 793 + }, + { + "epoch": 0.634185303514377, + "grad_norm": 0.24791331589221954, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 794 + }, + { + "epoch": 0.6349840255591054, + "grad_norm": 0.3028055727481842, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 795 + }, + { + "epoch": 0.6357827476038339, + "grad_norm": 0.15674540400505066, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 796 + }, + { + "epoch": 0.6365814696485623, + "grad_norm": 0.08521793782711029, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 797 + }, + { + "epoch": 0.6373801916932907, + "grad_norm": 0.21750952303409576, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 798 + }, + { + "epoch": 0.6381789137380192, + "grad_norm": 0.18880338966846466, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 799 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.06699419766664505, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 800 + }, + { + "epoch": 0.639776357827476, + "grad_norm": 0.08062998205423355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 801 + }, + { + "epoch": 0.6405750798722045, + "grad_norm": 0.10635658353567123, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 802 + }, + { + "epoch": 0.6413738019169329, + "grad_norm": 0.05086763948202133, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 803 + }, + { + "epoch": 0.6421725239616614, + "grad_norm": 0.09852107614278793, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 804 + }, + { + "epoch": 0.6429712460063898, + "grad_norm": 0.11290771514177322, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 805 + }, + { + "epoch": 0.6437699680511182, + "grad_norm": 0.15106825530529022, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 806 + }, + { + "epoch": 0.6445686900958466, + "grad_norm": 0.13646326959133148, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 807 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 0.06398668140172958, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 808 + }, + { + "epoch": 0.6461661341853036, + "grad_norm": 0.11581127345561981, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 809 + }, + { + "epoch": 0.646964856230032, + "grad_norm": 0.15684139728546143, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 810 + }, + { + "epoch": 0.6477635782747604, + "grad_norm": 0.14094121754169464, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 811 + }, + { + "epoch": 0.6485623003194888, + "grad_norm": 0.0938766822218895, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 812 + }, + { + "epoch": 0.6493610223642172, + "grad_norm": 0.06041521951556206, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 813 + }, + { + "epoch": 0.6501597444089456, + "grad_norm": 0.13364291191101074, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 814 + }, + { + "epoch": 0.6509584664536742, + "grad_norm": 0.15577054023742676, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 815 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.1119854673743248, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 816 + }, + { + "epoch": 0.652555910543131, + "grad_norm": 0.07751357555389404, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 817 + }, + { + "epoch": 0.6533546325878594, + "grad_norm": 0.10110143572092056, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 818 + }, + { + "epoch": 0.6541533546325878, + "grad_norm": 0.19627511501312256, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 819 + }, + { + "epoch": 0.6549520766773163, + "grad_norm": 0.19837769865989685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 820 + }, + { + "epoch": 0.6557507987220448, + "grad_norm": 0.13598690927028656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 821 + }, + { + "epoch": 0.6565495207667732, + "grad_norm": 0.05950666591525078, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 822 + }, + { + "epoch": 0.6573482428115016, + "grad_norm": 0.060314662754535675, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 823 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 0.11455138027667999, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 824 + }, + { + "epoch": 0.6589456869009584, + "grad_norm": 0.16753345727920532, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 825 + }, + { + "epoch": 0.6597444089456869, + "grad_norm": 0.15707428753376007, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 826 + }, + { + "epoch": 0.6605431309904153, + "grad_norm": 0.07224153727293015, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 827 + }, + { + "epoch": 0.6613418530351438, + "grad_norm": 0.10538042336702347, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 828 + }, + { + "epoch": 0.6621405750798722, + "grad_norm": 0.18855130672454834, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 829 + }, + { + "epoch": 0.6629392971246006, + "grad_norm": 0.17752179503440857, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 830 + }, + { + "epoch": 0.6637380191693291, + "grad_norm": 0.10109171271324158, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 831 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.15006190538406372, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 832 + }, + { + "epoch": 0.6653354632587859, + "grad_norm": 0.2701014578342438, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 833 + }, + { + "epoch": 0.6661341853035144, + "grad_norm": 0.2607312500476837, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 834 + }, + { + "epoch": 0.6669329073482428, + "grad_norm": 0.19712841510772705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 835 + }, + { + "epoch": 0.6677316293929713, + "grad_norm": 0.0839366614818573, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 836 + }, + { + "epoch": 0.6685303514376997, + "grad_norm": 0.1595088541507721, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 837 + }, + { + "epoch": 0.6693290734824281, + "grad_norm": 0.2773466408252716, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 838 + }, + { + "epoch": 0.6701277955271565, + "grad_norm": 0.24616314470767975, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 839 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 0.15596427023410797, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 840 + }, + { + "epoch": 0.6717252396166135, + "grad_norm": 0.047822993248701096, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 841 + }, + { + "epoch": 0.6725239616613419, + "grad_norm": 0.17692670226097107, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 842 + }, + { + "epoch": 0.6733226837060703, + "grad_norm": 0.1742856502532959, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 843 + }, + { + "epoch": 0.6741214057507987, + "grad_norm": 0.15347127616405487, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 844 + }, + { + "epoch": 0.6749201277955271, + "grad_norm": 0.18238374590873718, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 845 + }, + { + "epoch": 0.6757188498402555, + "grad_norm": 0.1524323672056198, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 846 + }, + { + "epoch": 0.6765175718849841, + "grad_norm": 0.1820068210363388, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 847 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.2010941058397293, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 848 + }, + { + "epoch": 0.6781150159744409, + "grad_norm": 0.16428111493587494, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 849 + }, + { + "epoch": 0.6789137380191693, + "grad_norm": 0.1538572460412979, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 850 + }, + { + "epoch": 0.6797124600638977, + "grad_norm": 0.057427916675806046, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 851 + }, + { + "epoch": 0.6805111821086262, + "grad_norm": 0.08329081535339355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 852 + }, + { + "epoch": 0.6813099041533547, + "grad_norm": 0.05685174837708473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 853 + }, + { + "epoch": 0.6821086261980831, + "grad_norm": 0.15277032554149628, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 854 + }, + { + "epoch": 0.6829073482428115, + "grad_norm": 0.24243640899658203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 855 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 0.28722453117370605, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 856 + }, + { + "epoch": 0.6845047923322684, + "grad_norm": 0.1997309774160385, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 857 + }, + { + "epoch": 0.6853035143769968, + "grad_norm": 0.061719026416540146, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 858 + }, + { + "epoch": 0.6861022364217252, + "grad_norm": 0.23425672948360443, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 859 + }, + { + "epoch": 0.6869009584664537, + "grad_norm": 0.350109726190567, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 860 + }, + { + "epoch": 0.6876996805111821, + "grad_norm": 0.34444838762283325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 861 + }, + { + "epoch": 0.6884984025559105, + "grad_norm": 0.15325413644313812, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 862 + }, + { + "epoch": 0.689297124600639, + "grad_norm": 0.1227702870965004, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 863 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 0.24337291717529297, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 864 + }, + { + "epoch": 0.6908945686900958, + "grad_norm": 0.24047589302062988, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 865 + }, + { + "epoch": 0.6916932907348243, + "grad_norm": 0.13576050102710724, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 866 + }, + { + "epoch": 0.6924920127795527, + "grad_norm": 0.0503714494407177, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 867 + }, + { + "epoch": 0.6932907348242812, + "grad_norm": 0.1292860060930252, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 868 + }, + { + "epoch": 0.6940894568690096, + "grad_norm": 0.14698486030101776, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 869 + }, + { + "epoch": 0.694888178913738, + "grad_norm": 0.07720573991537094, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 870 + }, + { + "epoch": 0.6956869009584664, + "grad_norm": 0.1604471504688263, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 871 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 0.32734861969947815, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 872 + }, + { + "epoch": 0.6972843450479234, + "grad_norm": 0.32366684079170227, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 873 + }, + { + "epoch": 0.6980830670926518, + "grad_norm": 0.18428802490234375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 874 + }, + { + "epoch": 0.6988817891373802, + "grad_norm": 0.07498858869075775, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 875 + }, + { + "epoch": 0.6996805111821086, + "grad_norm": 0.24449816346168518, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 876 + }, + { + "epoch": 0.700479233226837, + "grad_norm": 0.26649829745292664, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 877 + }, + { + "epoch": 0.7012779552715654, + "grad_norm": 0.1315024197101593, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 878 + }, + { + "epoch": 0.702076677316294, + "grad_norm": 0.10907325148582458, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 879 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.2364589273929596, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 880 + }, + { + "epoch": 0.7036741214057508, + "grad_norm": 0.1663885861635208, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 881 + }, + { + "epoch": 0.7044728434504792, + "grad_norm": 0.0596470907330513, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 882 + }, + { + "epoch": 0.7052715654952076, + "grad_norm": 0.1519233137369156, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 883 + }, + { + "epoch": 0.7060702875399361, + "grad_norm": 0.23089520633220673, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 884 + }, + { + "epoch": 0.7068690095846646, + "grad_norm": 0.20667214691638947, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 885 + }, + { + "epoch": 0.707667731629393, + "grad_norm": 0.10739922523498535, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 886 + }, + { + "epoch": 0.7084664536741214, + "grad_norm": 0.04334057494997978, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 887 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 0.15619881451129913, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 888 + }, + { + "epoch": 0.7100638977635783, + "grad_norm": 0.26618269085884094, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 889 + }, + { + "epoch": 0.7108626198083067, + "grad_norm": 0.1834406554698944, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 890 + }, + { + "epoch": 0.7116613418530351, + "grad_norm": 0.08332087099552155, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 891 + }, + { + "epoch": 0.7124600638977636, + "grad_norm": 0.23721523582935333, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 892 + }, + { + "epoch": 0.713258785942492, + "grad_norm": 0.2912815809249878, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 893 + }, + { + "epoch": 0.7140575079872205, + "grad_norm": 0.25534820556640625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 894 + }, + { + "epoch": 0.7148562300319489, + "grad_norm": 0.14200575649738312, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 895 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.08668249845504761, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 896 + }, + { + "epoch": 0.7164536741214057, + "grad_norm": 0.2358543574810028, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 897 + }, + { + "epoch": 0.7172523961661342, + "grad_norm": 0.2729748487472534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 898 + }, + { + "epoch": 0.7180511182108626, + "grad_norm": 0.14862589538097382, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 899 + }, + { + "epoch": 0.7188498402555911, + "grad_norm": 0.14500044286251068, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 900 + }, + { + "epoch": 0.7196485623003195, + "grad_norm": 0.28659892082214355, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 901 + }, + { + "epoch": 0.7204472843450479, + "grad_norm": 0.2974075376987457, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 902 + }, + { + "epoch": 0.7212460063897763, + "grad_norm": 0.07839605212211609, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 903 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 0.2542141079902649, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 904 + }, + { + "epoch": 0.7228434504792333, + "grad_norm": 0.357192724943161, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 905 + }, + { + "epoch": 0.7236421725239617, + "grad_norm": 0.21535371243953705, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 906 + }, + { + "epoch": 0.7244408945686901, + "grad_norm": 0.08053386211395264, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 907 + }, + { + "epoch": 0.7252396166134185, + "grad_norm": 0.22670729458332062, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 908 + }, + { + "epoch": 0.7260383386581469, + "grad_norm": 0.21510791778564453, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 909 + }, + { + "epoch": 0.7268370607028753, + "grad_norm": 0.07556216418743134, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 910 + }, + { + "epoch": 0.7276357827476039, + "grad_norm": 0.08772645890712738, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 911 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.2531013488769531, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 912 + }, + { + "epoch": 0.7292332268370607, + "grad_norm": 0.06658858805894852, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 913 + }, + { + "epoch": 0.7300319488817891, + "grad_norm": 0.09869293123483658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 914 + }, + { + "epoch": 0.7308306709265175, + "grad_norm": 0.17758162319660187, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 915 + }, + { + "epoch": 0.731629392971246, + "grad_norm": 0.16267521679401398, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 916 + }, + { + "epoch": 0.7324281150159745, + "grad_norm": 0.09948690980672836, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 917 + }, + { + "epoch": 0.7332268370607029, + "grad_norm": 0.05900302529335022, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 918 + }, + { + "epoch": 0.7340255591054313, + "grad_norm": 0.08200150728225708, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 919 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 0.09217624366283417, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 920 + }, + { + "epoch": 0.7356230031948882, + "grad_norm": 0.12414196133613586, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 921 + }, + { + "epoch": 0.7364217252396166, + "grad_norm": 0.131890669465065, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 922 + }, + { + "epoch": 0.737220447284345, + "grad_norm": 0.1187182292342186, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 923 + }, + { + "epoch": 0.7380191693290735, + "grad_norm": 0.09890205413103104, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 924 + }, + { + "epoch": 0.7388178913738019, + "grad_norm": 0.06730851531028748, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 925 + }, + { + "epoch": 0.7396166134185304, + "grad_norm": 0.038627006113529205, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 926 + }, + { + "epoch": 0.7404153354632588, + "grad_norm": 0.07148899137973785, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 927 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 0.05876476690173149, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 928 + }, + { + "epoch": 0.7420127795527156, + "grad_norm": 0.11069595813751221, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 929 + }, + { + "epoch": 0.7428115015974441, + "grad_norm": 0.10409362614154816, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 930 + }, + { + "epoch": 0.7436102236421726, + "grad_norm": 0.08115468919277191, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 931 + }, + { + "epoch": 0.744408945686901, + "grad_norm": 0.14105193316936493, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 932 + }, + { + "epoch": 0.7452076677316294, + "grad_norm": 0.07780246436595917, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 933 + }, + { + "epoch": 0.7460063897763578, + "grad_norm": 0.08895678073167801, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 934 + }, + { + "epoch": 0.7468051118210862, + "grad_norm": 0.10844068974256516, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 935 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 0.07179753482341766, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 936 + }, + { + "epoch": 0.7484025559105432, + "grad_norm": 0.11107192933559418, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 937 + }, + { + "epoch": 0.7492012779552716, + "grad_norm": 0.2845052480697632, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 938 + }, + { + "epoch": 0.75, + "grad_norm": 0.41480058431625366, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 939 + }, + { + "epoch": 0.7507987220447284, + "grad_norm": 0.3101426064968109, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 940 + }, + { + "epoch": 0.7515974440894568, + "grad_norm": 0.09521801024675369, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 941 + }, + { + "epoch": 0.7523961661341853, + "grad_norm": 0.18613341450691223, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 942 + }, + { + "epoch": 0.7531948881789138, + "grad_norm": 0.2665672302246094, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 943 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.20693817734718323, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 944 + }, + { + "epoch": 0.7547923322683706, + "grad_norm": 0.05853262171149254, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 945 + }, + { + "epoch": 0.755591054313099, + "grad_norm": 0.22123664617538452, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 946 + }, + { + "epoch": 0.7563897763578274, + "grad_norm": 0.2845379114151001, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 947 + }, + { + "epoch": 0.7571884984025559, + "grad_norm": 0.20357397198677063, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 948 + }, + { + "epoch": 0.7579872204472844, + "grad_norm": 0.0897352546453476, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 949 + }, + { + "epoch": 0.7587859424920128, + "grad_norm": 0.06572771817445755, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 950 + }, + { + "epoch": 0.7595846645367412, + "grad_norm": 0.09441806375980377, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 951 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 0.06848953664302826, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 952 + }, + { + "epoch": 0.7611821086261981, + "grad_norm": 0.127177432179451, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 953 + }, + { + "epoch": 0.7619808306709265, + "grad_norm": 0.25466713309288025, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 954 + }, + { + "epoch": 0.762779552715655, + "grad_norm": 0.32952556014060974, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 955 + }, + { + "epoch": 0.7635782747603834, + "grad_norm": 0.2976897358894348, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 956 + }, + { + "epoch": 0.7643769968051118, + "grad_norm": 0.17444387078285217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 957 + }, + { + "epoch": 0.7651757188498403, + "grad_norm": 0.10458981990814209, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 958 + }, + { + "epoch": 0.7659744408945687, + "grad_norm": 0.07028939574956894, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 959 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.1888386309146881, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 960 + }, + { + "epoch": 0.7675718849840255, + "grad_norm": 0.19400012493133545, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 961 + }, + { + "epoch": 0.768370607028754, + "grad_norm": 0.12069790065288544, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 962 + }, + { + "epoch": 0.7691693290734825, + "grad_norm": 0.06206851452589035, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 963 + }, + { + "epoch": 0.7699680511182109, + "grad_norm": 0.07195326685905457, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 964 + }, + { + "epoch": 0.7707667731629393, + "grad_norm": 0.09240477532148361, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 965 + }, + { + "epoch": 0.7715654952076677, + "grad_norm": 0.04433378204703331, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 966 + }, + { + "epoch": 0.7723642172523961, + "grad_norm": 0.07411819696426392, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 967 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 0.11440210789442062, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 968 + }, + { + "epoch": 0.7739616613418531, + "grad_norm": 0.23913118243217468, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 969 + }, + { + "epoch": 0.7747603833865815, + "grad_norm": 0.31028202176094055, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 970 + }, + { + "epoch": 0.7755591054313099, + "grad_norm": 0.3343825936317444, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 971 + }, + { + "epoch": 0.7763578274760383, + "grad_norm": 0.2559935748577118, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 972 + }, + { + "epoch": 0.7771565495207667, + "grad_norm": 0.05685359239578247, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 973 + }, + { + "epoch": 0.7779552715654952, + "grad_norm": 0.1760183721780777, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 974 + }, + { + "epoch": 0.7787539936102237, + "grad_norm": 0.25240832567214966, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 975 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.13724291324615479, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 976 + }, + { + "epoch": 0.7803514376996805, + "grad_norm": 0.11687567830085754, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 977 + }, + { + "epoch": 0.7811501597444089, + "grad_norm": 0.31319329142570496, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 978 + }, + { + "epoch": 0.7819488817891374, + "grad_norm": 0.3297184705734253, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 979 + }, + { + "epoch": 0.7827476038338658, + "grad_norm": 0.19443389773368835, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 980 + }, + { + "epoch": 0.7835463258785943, + "grad_norm": 0.04911043494939804, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 981 + }, + { + "epoch": 0.7843450479233227, + "grad_norm": 0.19837717711925507, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 982 + }, + { + "epoch": 0.7851437699680511, + "grad_norm": 0.23165349662303925, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 983 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 0.12156365066766739, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 984 + }, + { + "epoch": 0.786741214057508, + "grad_norm": 0.1305016428232193, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 985 + }, + { + "epoch": 0.7875399361022364, + "grad_norm": 0.12228422611951828, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 986 + }, + { + "epoch": 0.7883386581469649, + "grad_norm": 0.09014695137739182, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 987 + }, + { + "epoch": 0.7891373801916933, + "grad_norm": 0.060052234679460526, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 988 + }, + { + "epoch": 0.7899361022364217, + "grad_norm": 0.17842933535575867, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 989 + }, + { + "epoch": 0.7907348242811502, + "grad_norm": 0.2823020815849304, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 990 + }, + { + "epoch": 0.7915335463258786, + "grad_norm": 0.2571483254432678, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 991 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.11443623155355453, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 992 + }, + { + "epoch": 0.7931309904153354, + "grad_norm": 0.09048285335302353, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 993 + }, + { + "epoch": 0.7939297124600639, + "grad_norm": 0.1863749772310257, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 994 + }, + { + "epoch": 0.7947284345047924, + "grad_norm": 0.1481461524963379, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 995 + }, + { + "epoch": 0.7955271565495208, + "grad_norm": 0.06870540231466293, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 996 + }, + { + "epoch": 0.7963258785942492, + "grad_norm": 0.04223543405532837, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 997 + }, + { + "epoch": 0.7971246006389776, + "grad_norm": 0.04194851219654083, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 998 + }, + { + "epoch": 0.797923322683706, + "grad_norm": 0.03982497751712799, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 999 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 0.20985758304595947, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1000 + }, + { + "epoch": 0.799520766773163, + "grad_norm": 0.11346526443958282, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1001 + }, + { + "epoch": 0.8003194888178914, + "grad_norm": 0.16594401001930237, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1002 + }, + { + "epoch": 0.8011182108626198, + "grad_norm": 0.1788545846939087, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1003 + }, + { + "epoch": 0.8019169329073482, + "grad_norm": 0.07928512245416641, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1004 + }, + { + "epoch": 0.8027156549520766, + "grad_norm": 0.0953991562128067, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1005 + }, + { + "epoch": 0.8035143769968051, + "grad_norm": 0.2052081823348999, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1006 + }, + { + "epoch": 0.8043130990415336, + "grad_norm": 0.1999465525150299, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1007 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.09821965545415878, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1008 + }, + { + "epoch": 0.8059105431309904, + "grad_norm": 0.0762021467089653, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1009 + }, + { + "epoch": 0.8067092651757188, + "grad_norm": 0.20475991070270538, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 0.8075079872204473, + "grad_norm": 0.23028631508350372, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1011 + }, + { + "epoch": 0.8083067092651757, + "grad_norm": 0.12122747302055359, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.8091054313099042, + "grad_norm": 0.08124672621488571, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1013 + }, + { + "epoch": 0.8099041533546326, + "grad_norm": 0.21313415467739105, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1014 + }, + { + "epoch": 0.810702875399361, + "grad_norm": 0.311813622713089, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1015 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 0.3032541275024414, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1016 + }, + { + "epoch": 0.8123003194888179, + "grad_norm": 0.21727560460567474, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1017 + }, + { + "epoch": 0.8130990415335463, + "grad_norm": 0.0620480477809906, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1018 + }, + { + "epoch": 0.8138977635782748, + "grad_norm": 0.20105740427970886, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1019 + }, + { + "epoch": 0.8146964856230032, + "grad_norm": 0.28996244072914124, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1020 + }, + { + "epoch": 0.8154952076677316, + "grad_norm": 0.22115157544612885, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1021 + }, + { + "epoch": 0.8162939297124601, + "grad_norm": 0.10071029514074326, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1022 + }, + { + "epoch": 0.8170926517571885, + "grad_norm": 0.12363877147436142, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1023 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.29970163106918335, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1024 + }, + { + "epoch": 0.8186900958466453, + "grad_norm": 0.32754749059677124, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1025 + }, + { + "epoch": 0.8194888178913738, + "grad_norm": 0.20028825104236603, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1026 + }, + { + "epoch": 0.8202875399361023, + "grad_norm": 0.08162792772054672, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1027 + }, + { + "epoch": 0.8210862619808307, + "grad_norm": 0.27463749051094055, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1028 + }, + { + "epoch": 0.8218849840255591, + "grad_norm": 0.30335354804992676, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1029 + }, + { + "epoch": 0.8226837060702875, + "grad_norm": 0.12106633186340332, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1030 + }, + { + "epoch": 0.8234824281150159, + "grad_norm": 0.16331955790519714, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1031 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 0.2764187455177307, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1032 + }, + { + "epoch": 0.8250798722044729, + "grad_norm": 0.20136456191539764, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1033 + }, + { + "epoch": 0.8258785942492013, + "grad_norm": 0.06438590586185455, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1034 + }, + { + "epoch": 0.8266773162939297, + "grad_norm": 0.18764367699623108, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1035 + }, + { + "epoch": 0.8274760383386581, + "grad_norm": 0.20327645540237427, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1036 + }, + { + "epoch": 0.8282747603833865, + "grad_norm": 0.08825036138296127, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1037 + }, + { + "epoch": 0.829073482428115, + "grad_norm": 0.11037785559892654, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1038 + }, + { + "epoch": 0.8298722044728435, + "grad_norm": 0.18273280560970306, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1039 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.16820372641086578, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1040 + }, + { + "epoch": 0.8314696485623003, + "grad_norm": 0.06250625103712082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1041 + }, + { + "epoch": 0.8322683706070287, + "grad_norm": 0.12141115218400955, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1042 + }, + { + "epoch": 0.8330670926517572, + "grad_norm": 0.13594450056552887, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1043 + }, + { + "epoch": 0.8338658146964856, + "grad_norm": 0.16069599986076355, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1044 + }, + { + "epoch": 0.8346645367412141, + "grad_norm": 0.11631255596876144, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1045 + }, + { + "epoch": 0.8354632587859425, + "grad_norm": 0.050075192004442215, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1046 + }, + { + "epoch": 0.8362619808306709, + "grad_norm": 0.06317511945962906, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1047 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 0.09078527241945267, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1048 + }, + { + "epoch": 0.8378594249201278, + "grad_norm": 0.1618194878101349, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1049 + }, + { + "epoch": 0.8386581469648562, + "grad_norm": 0.2044777274131775, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1050 + }, + { + "epoch": 0.8394568690095847, + "grad_norm": 0.20439067482948303, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.8402555910543131, + "grad_norm": 0.1967901587486267, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1052 + }, + { + "epoch": 0.8410543130990416, + "grad_norm": 0.06829354166984558, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1053 + }, + { + "epoch": 0.84185303514377, + "grad_norm": 0.12168806046247482, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1054 + }, + { + "epoch": 0.8426517571884984, + "grad_norm": 0.23461978137493134, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1055 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.28916484117507935, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1056 + }, + { + "epoch": 0.8442492012779552, + "grad_norm": 0.21827733516693115, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1057 + }, + { + "epoch": 0.8450479233226837, + "grad_norm": 0.045396093279123306, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1058 + }, + { + "epoch": 0.8458466453674122, + "grad_norm": 0.2391543984413147, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1059 + }, + { + "epoch": 0.8466453674121406, + "grad_norm": 0.2916122078895569, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1060 + }, + { + "epoch": 0.847444089456869, + "grad_norm": 0.1589413434267044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1061 + }, + { + "epoch": 0.8482428115015974, + "grad_norm": 0.14869733154773712, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1062 + }, + { + "epoch": 0.8490415335463258, + "grad_norm": 0.3719956874847412, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 1063 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1064 + }, + { + "epoch": 0.8506389776357828, + "grad_norm": 0.22647641599178314, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1065 + }, + { + "epoch": 0.8514376996805112, + "grad_norm": 0.14329837262630463, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1066 + }, + { + "epoch": 0.8522364217252396, + "grad_norm": 0.2508337199687958, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1067 + }, + { + "epoch": 0.853035143769968, + "grad_norm": 0.16483807563781738, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1068 + }, + { + "epoch": 0.8538338658146964, + "grad_norm": 0.08231265842914581, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1069 + }, + { + "epoch": 0.854632587859425, + "grad_norm": 0.15707719326019287, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1070 + }, + { + "epoch": 0.8554313099041534, + "grad_norm": 0.1741408407688141, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1071 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.06281771510839462, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1072 + }, + { + "epoch": 0.8570287539936102, + "grad_norm": 0.10936494171619415, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1073 + }, + { + "epoch": 0.8578274760383386, + "grad_norm": 0.08680932223796844, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1074 + }, + { + "epoch": 0.8586261980830671, + "grad_norm": 0.05679824575781822, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1075 + }, + { + "epoch": 0.8594249201277955, + "grad_norm": 0.07635466009378433, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1076 + }, + { + "epoch": 0.860223642172524, + "grad_norm": 0.08391202241182327, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 1077 + }, + { + "epoch": 0.8610223642172524, + "grad_norm": 0.044910602271556854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1078 + }, + { + "epoch": 0.8618210862619808, + "grad_norm": 0.07833745330572128, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 1079 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 0.11653397232294083, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1080 + }, + { + "epoch": 0.8634185303514377, + "grad_norm": 0.09041672199964523, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1081 + }, + { + "epoch": 0.8642172523961661, + "grad_norm": 0.061735767871141434, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1082 + }, + { + "epoch": 0.8650159744408946, + "grad_norm": 0.042857520282268524, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1083 + }, + { + "epoch": 0.865814696485623, + "grad_norm": 0.040145136415958405, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1084 + }, + { + "epoch": 0.8666134185303515, + "grad_norm": 0.05785573646426201, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1085 + }, + { + "epoch": 0.8674121405750799, + "grad_norm": 0.13503877818584442, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1086 + }, + { + "epoch": 0.8682108626198083, + "grad_norm": 0.16243800520896912, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1087 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.13211014866828918, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1088 + }, + { + "epoch": 0.8698083067092651, + "grad_norm": 0.08136262744665146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1089 + }, + { + "epoch": 0.8706070287539937, + "grad_norm": 0.07881205528974533, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1090 + }, + { + "epoch": 0.8714057507987221, + "grad_norm": 0.1660437136888504, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1091 + }, + { + "epoch": 0.8722044728434505, + "grad_norm": 0.1955040693283081, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1092 + }, + { + "epoch": 0.8730031948881789, + "grad_norm": 0.18039803206920624, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1093 + }, + { + "epoch": 0.8738019169329073, + "grad_norm": 0.13832250237464905, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1094 + }, + { + "epoch": 0.8746006389776357, + "grad_norm": 0.06982281059026718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1095 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 0.06607141345739365, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1096 + }, + { + "epoch": 0.8761980830670927, + "grad_norm": 0.08685869723558426, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1097 + }, + { + "epoch": 0.8769968051118211, + "grad_norm": 0.09157849103212357, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1098 + }, + { + "epoch": 0.8777955271565495, + "grad_norm": 0.05980607122182846, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1099 + }, + { + "epoch": 0.8785942492012779, + "grad_norm": 0.05037426948547363, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1100 + }, + { + "epoch": 0.8793929712460063, + "grad_norm": 0.09998175501823425, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 1101 + }, + { + "epoch": 0.8801916932907349, + "grad_norm": 0.14255133271217346, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1102 + }, + { + "epoch": 0.8809904153354633, + "grad_norm": 0.1332579255104065, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1103 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.06453413516283035, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1104 + }, + { + "epoch": 0.8825878594249201, + "grad_norm": 0.07107783854007721, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1105 + }, + { + "epoch": 0.8833865814696485, + "grad_norm": 0.14025849103927612, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1106 + }, + { + "epoch": 0.884185303514377, + "grad_norm": 0.18791186809539795, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1107 + }, + { + "epoch": 0.8849840255591054, + "grad_norm": 0.228570356965065, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1108 + }, + { + "epoch": 0.8857827476038339, + "grad_norm": 0.21574346721172333, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1109 + }, + { + "epoch": 0.8865814696485623, + "grad_norm": 0.14833906292915344, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1110 + }, + { + "epoch": 0.8873801916932907, + "grad_norm": 0.04756765812635422, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1111 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 0.13023658096790314, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1112 + }, + { + "epoch": 0.8889776357827476, + "grad_norm": 0.21199558675289154, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1113 + }, + { + "epoch": 0.889776357827476, + "grad_norm": 0.19635719060897827, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1114 + }, + { + "epoch": 0.8905750798722045, + "grad_norm": 0.14753709733486176, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1115 + }, + { + "epoch": 0.8913738019169329, + "grad_norm": 0.06639572232961655, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1116 + }, + { + "epoch": 0.8921725239616614, + "grad_norm": 0.09707840532064438, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1117 + }, + { + "epoch": 0.8929712460063898, + "grad_norm": 0.20057998597621918, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 1118 + }, + { + "epoch": 0.8937699680511182, + "grad_norm": 0.232718825340271, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1119 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.16340196132659912, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1120 + }, + { + "epoch": 0.895367412140575, + "grad_norm": 0.04553915560245514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 1121 + }, + { + "epoch": 0.8961661341853036, + "grad_norm": 0.12561571598052979, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1122 + }, + { + "epoch": 0.896964856230032, + "grad_norm": 0.19254666566848755, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1123 + }, + { + "epoch": 0.8977635782747604, + "grad_norm": 0.12862572073936462, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1124 + }, + { + "epoch": 0.8985623003194888, + "grad_norm": 0.051237158477306366, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1125 + }, + { + "epoch": 0.8993610223642172, + "grad_norm": 0.18603810667991638, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1126 + }, + { + "epoch": 0.9001597444089456, + "grad_norm": 0.2498294860124588, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1127 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 0.18809954822063446, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1128 + }, + { + "epoch": 0.9017571884984026, + "grad_norm": 0.06116599217057228, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1129 + }, + { + "epoch": 0.902555910543131, + "grad_norm": 0.07710137963294983, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1130 + }, + { + "epoch": 0.9033546325878594, + "grad_norm": 0.11208303272724152, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1131 + }, + { + "epoch": 0.9041533546325878, + "grad_norm": 0.11864814907312393, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1132 + }, + { + "epoch": 0.9049520766773163, + "grad_norm": 0.1261119246482849, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1133 + }, + { + "epoch": 0.9057507987220448, + "grad_norm": 0.10841526836156845, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1134 + }, + { + "epoch": 0.9065495207667732, + "grad_norm": 0.04871276393532753, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1135 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.08953645080327988, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1136 + }, + { + "epoch": 0.90814696485623, + "grad_norm": 0.1590365469455719, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1137 + }, + { + "epoch": 0.9089456869009584, + "grad_norm": 0.155691459774971, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1138 + }, + { + "epoch": 0.9097444089456869, + "grad_norm": 0.09982484579086304, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1139 + }, + { + "epoch": 0.9105431309904153, + "grad_norm": 0.08257611095905304, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1140 + }, + { + "epoch": 0.9113418530351438, + "grad_norm": 0.1036139577627182, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1141 + }, + { + "epoch": 0.9121405750798722, + "grad_norm": 0.06543707102537155, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1142 + }, + { + "epoch": 0.9129392971246006, + "grad_norm": 0.05375903844833374, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1143 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 0.13674795627593994, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1144 + }, + { + "epoch": 0.9145367412140575, + "grad_norm": 0.21575352549552917, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 1145 + }, + { + "epoch": 0.9153354632587859, + "grad_norm": 0.22478559613227844, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1146 + }, + { + "epoch": 0.9161341853035144, + "grad_norm": 0.1854555904865265, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1147 + }, + { + "epoch": 0.9169329073482428, + "grad_norm": 0.08605340123176575, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1148 + }, + { + "epoch": 0.9177316293929713, + "grad_norm": 0.14082656800746918, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1149 + }, + { + "epoch": 0.9185303514376997, + "grad_norm": 0.3214903771877289, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1150 + }, + { + "epoch": 0.9193290734824281, + "grad_norm": 0.4360012412071228, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 1151 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.3582250773906708, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1152 + }, + { + "epoch": 0.920926517571885, + "grad_norm": 0.1142783984541893, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1153 + }, + { + "epoch": 0.9217252396166135, + "grad_norm": 0.2035343497991562, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1154 + }, + { + "epoch": 0.9225239616613419, + "grad_norm": 0.3506172299385071, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1155 + }, + { + "epoch": 0.9233226837060703, + "grad_norm": 0.2129906564950943, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1156 + }, + { + "epoch": 0.9241214057507987, + "grad_norm": 0.12158108502626419, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1157 + }, + { + "epoch": 0.9249201277955271, + "grad_norm": 0.3931717872619629, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1158 + }, + { + "epoch": 0.9257188498402555, + "grad_norm": 0.36336907744407654, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1159 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 0.06781382113695145, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1160 + }, + { + "epoch": 0.9273162939297125, + "grad_norm": 0.3335910141468048, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1161 + }, + { + "epoch": 0.9281150159744409, + "grad_norm": 0.5017055869102478, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 1162 + }, + { + "epoch": 0.9289137380191693, + "grad_norm": 0.3635455071926117, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1163 + }, + { + "epoch": 0.9297124600638977, + "grad_norm": 0.06748906522989273, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1164 + }, + { + "epoch": 0.9305111821086262, + "grad_norm": 0.3723882734775543, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1165 + }, + { + "epoch": 0.9313099041533547, + "grad_norm": 0.2976631820201874, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1166 + }, + { + "epoch": 0.9321086261980831, + "grad_norm": 0.06998804211616516, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1167 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.3307324945926666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1168 + }, + { + "epoch": 0.9337060702875399, + "grad_norm": 0.29726436734199524, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1169 + }, + { + "epoch": 0.9345047923322684, + "grad_norm": 0.048596691340208054, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1170 + }, + { + "epoch": 0.9353035143769968, + "grad_norm": 0.2840823233127594, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1171 + }, + { + "epoch": 0.9361022364217252, + "grad_norm": 0.31426292657852173, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1172 + }, + { + "epoch": 0.9369009584664537, + "grad_norm": 0.16073261201381683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1173 + }, + { + "epoch": 0.9376996805111821, + "grad_norm": 0.05725392326712608, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1174 + }, + { + "epoch": 0.9384984025559105, + "grad_norm": 0.1674586981534958, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1175 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 0.13738949596881866, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1176 + }, + { + "epoch": 0.9400958466453674, + "grad_norm": 0.05350235849618912, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1177 + }, + { + "epoch": 0.9408945686900958, + "grad_norm": 0.10518805682659149, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.9416932907348243, + "grad_norm": 0.11264974623918533, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1179 + }, + { + "epoch": 0.9424920127795527, + "grad_norm": 0.06757227331399918, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1180 + }, + { + "epoch": 0.9432907348242812, + "grad_norm": 0.07214303314685822, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1181 + }, + { + "epoch": 0.9440894568690096, + "grad_norm": 0.12705406546592712, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1182 + }, + { + "epoch": 0.944888178913738, + "grad_norm": 0.09937570244073868, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 1183 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 0.05628623813390732, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1184 + }, + { + "epoch": 0.9464856230031949, + "grad_norm": 0.05685505270957947, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1185 + }, + { + "epoch": 0.9472843450479234, + "grad_norm": 0.06150783598423004, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1186 + }, + { + "epoch": 0.9480830670926518, + "grad_norm": 0.04247362166643143, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1187 + }, + { + "epoch": 0.9488817891373802, + "grad_norm": 0.05664962902665138, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1188 + }, + { + "epoch": 0.9496805111821086, + "grad_norm": 0.07421324402093887, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1189 + }, + { + "epoch": 0.950479233226837, + "grad_norm": 0.043645020574331284, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1190 + }, + { + "epoch": 0.9512779552715654, + "grad_norm": 0.0692208856344223, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1191 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 0.13804891705513, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1192 + }, + { + "epoch": 0.9528753993610224, + "grad_norm": 0.14874884486198425, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1193 + }, + { + "epoch": 0.9536741214057508, + "grad_norm": 0.08449128270149231, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1194 + }, + { + "epoch": 0.9544728434504792, + "grad_norm": 0.035032968968153, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1195 + }, + { + "epoch": 0.9552715654952076, + "grad_norm": 0.10837965458631516, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1196 + }, + { + "epoch": 0.9560702875399361, + "grad_norm": 0.17972581088542938, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1197 + }, + { + "epoch": 0.9568690095846646, + "grad_norm": 0.17075787484645844, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1198 + }, + { + "epoch": 0.957667731629393, + "grad_norm": 0.08269231766462326, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1199 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.07269515842199326, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1200 + }, + { + "epoch": 0.9592651757188498, + "grad_norm": 0.15345947444438934, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1201 + }, + { + "epoch": 0.9600638977635783, + "grad_norm": 0.19025452435016632, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1202 + }, + { + "epoch": 0.9608626198083067, + "grad_norm": 0.1782686710357666, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1203 + }, + { + "epoch": 0.9616613418530351, + "grad_norm": 0.1296931356191635, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1204 + }, + { + "epoch": 0.9624600638977636, + "grad_norm": 0.036208219826221466, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1205 + }, + { + "epoch": 0.963258785942492, + "grad_norm": 0.14282052218914032, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1206 + }, + { + "epoch": 0.9640575079872205, + "grad_norm": 0.26539498567581177, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1207 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 0.28352224826812744, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1208 + }, + { + "epoch": 0.9656549520766773, + "grad_norm": 0.14476369321346283, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1209 + }, + { + "epoch": 0.9664536741214057, + "grad_norm": 0.06859725713729858, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1210 + }, + { + "epoch": 0.9672523961661342, + "grad_norm": 0.19093726575374603, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1211 + }, + { + "epoch": 0.9680511182108626, + "grad_norm": 0.1848185807466507, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1212 + }, + { + "epoch": 0.9688498402555911, + "grad_norm": 0.05829976871609688, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1213 + }, + { + "epoch": 0.9696485623003195, + "grad_norm": 0.10105405002832413, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1214 + }, + { + "epoch": 0.9704472843450479, + "grad_norm": 0.12762011587619781, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1215 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.08238376677036285, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1216 + }, + { + "epoch": 0.9720447284345048, + "grad_norm": 0.07039444148540497, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1217 + }, + { + "epoch": 0.9728434504792333, + "grad_norm": 0.1320599615573883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1218 + }, + { + "epoch": 0.9736421725239617, + "grad_norm": 0.07799404859542847, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1219 + }, + { + "epoch": 0.9744408945686901, + "grad_norm": 0.11601961404085159, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1220 + }, + { + "epoch": 0.9752396166134185, + "grad_norm": 0.26134374737739563, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1221 + }, + { + "epoch": 0.9760383386581469, + "grad_norm": 0.275513231754303, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1222 + }, + { + "epoch": 0.9768370607028753, + "grad_norm": 0.0711631178855896, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1223 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 0.1879139244556427, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1224 + }, + { + "epoch": 0.9784345047923323, + "grad_norm": 0.24822647869586945, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1225 + }, + { + "epoch": 0.9792332268370607, + "grad_norm": 0.1244853138923645, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1226 + }, + { + "epoch": 0.9800319488817891, + "grad_norm": 0.07694529742002487, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1227 + }, + { + "epoch": 0.9808306709265175, + "grad_norm": 0.1280626803636551, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1228 + }, + { + "epoch": 0.981629392971246, + "grad_norm": 0.09127703309059143, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1229 + }, + { + "epoch": 0.9824281150159745, + "grad_norm": 0.06747932732105255, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1230 + }, + { + "epoch": 0.9832268370607029, + "grad_norm": 0.08196533471345901, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1231 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.09074689447879791, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1232 + }, + { + "epoch": 0.9848242811501597, + "grad_norm": 0.06031282991170883, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1233 + }, + { + "epoch": 0.9856230031948882, + "grad_norm": 0.07138215005397797, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1234 + }, + { + "epoch": 0.9864217252396166, + "grad_norm": 0.11056806892156601, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1235 + }, + { + "epoch": 0.987220447284345, + "grad_norm": 0.09108638018369675, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1236 + }, + { + "epoch": 0.9880191693290735, + "grad_norm": 0.0515020377933979, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1237 + }, + { + "epoch": 0.9888178913738019, + "grad_norm": 0.08467873930931091, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1238 + }, + { + "epoch": 0.9896166134185304, + "grad_norm": 0.10424523055553436, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1239 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 0.11506868153810501, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1240 + }, + { + "epoch": 0.9912140575079872, + "grad_norm": 0.13226476311683655, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1241 + }, + { + "epoch": 0.9920127795527156, + "grad_norm": 0.13714630901813507, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1242 + }, + { + "epoch": 0.9928115015974441, + "grad_norm": 0.08985403180122375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1243 + }, + { + "epoch": 0.9936102236421726, + "grad_norm": 0.1107666939496994, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 1244 + }, + { + "epoch": 0.994408945686901, + "grad_norm": 0.130653515458107, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1245 + }, + { + "epoch": 0.9952076677316294, + "grad_norm": 0.10675778985023499, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1246 + }, + { + "epoch": 0.9960063897763578, + "grad_norm": 0.042045243084430695, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1247 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.07957674562931061, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1248 + }, + { + "epoch": 0.9976038338658147, + "grad_norm": 0.06926224380731583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1249 + }, + { + "epoch": 0.9984025559105432, + "grad_norm": 0.0849846750497818, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1250 + }, + { + "epoch": 0.9992012779552716, + "grad_norm": 0.12501482665538788, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1251 + }, + { + "epoch": 1.0, + "grad_norm": 0.1467234194278717, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1252 + }, + { + "epoch": 1.0007987220447285, + "grad_norm": 0.11206725984811783, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1253 + }, + { + "epoch": 1.0015974440894568, + "grad_norm": 0.05224297568202019, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1254 + }, + { + "epoch": 1.0023961661341854, + "grad_norm": 0.15176911652088165, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1255 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.22419261932373047, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1256 + }, + { + "epoch": 1.0039936102236422, + "grad_norm": 0.18444369733333588, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1257 + }, + { + "epoch": 1.0047923322683705, + "grad_norm": 0.06510337442159653, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1258 + }, + { + "epoch": 1.005591054313099, + "grad_norm": 0.16058789193630219, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1259 + }, + { + "epoch": 1.0063897763578276, + "grad_norm": 0.22726313769817352, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1260 + }, + { + "epoch": 1.0071884984025559, + "grad_norm": 0.21050630509853363, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1261 + }, + { + "epoch": 1.0079872204472844, + "grad_norm": 0.09227188676595688, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1262 + }, + { + "epoch": 1.0087859424920127, + "grad_norm": 0.11473584920167923, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 1263 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 0.12692919373512268, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1264 + }, + { + "epoch": 1.0103833865814698, + "grad_norm": 0.056371819227933884, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1265 + }, + { + "epoch": 1.011182108626198, + "grad_norm": 0.13166245818138123, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1266 + }, + { + "epoch": 1.0119808306709266, + "grad_norm": 0.2606523633003235, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1267 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 0.320832759141922, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1268 + }, + { + "epoch": 1.0135782747603834, + "grad_norm": 0.2074427455663681, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1269 + }, + { + "epoch": 1.0143769968051117, + "grad_norm": 0.05768958851695061, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1270 + }, + { + "epoch": 1.0151757188498403, + "grad_norm": 0.08107002079486847, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1271 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 0.12996292114257812, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1272 + }, + { + "epoch": 1.016773162939297, + "grad_norm": 0.1514650285243988, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1273 + }, + { + "epoch": 1.0175718849840256, + "grad_norm": 0.1007395088672638, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1274 + }, + { + "epoch": 1.018370607028754, + "grad_norm": 0.0831306204199791, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1275 + }, + { + "epoch": 1.0191693290734825, + "grad_norm": 0.09004336595535278, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1276 + }, + { + "epoch": 1.0199680511182108, + "grad_norm": 0.06632232666015625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1277 + }, + { + "epoch": 1.0207667731629393, + "grad_norm": 0.05073424428701401, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1278 + }, + { + "epoch": 1.0215654952076678, + "grad_norm": 0.06486333161592484, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 1279 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 0.1137472614645958, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1280 + }, + { + "epoch": 1.0231629392971247, + "grad_norm": 0.08062250912189484, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1281 + }, + { + "epoch": 1.023961661341853, + "grad_norm": 0.05046350136399269, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1282 + }, + { + "epoch": 1.0247603833865815, + "grad_norm": 0.06503880023956299, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1283 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 0.10730332881212234, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1284 + }, + { + "epoch": 1.0263578274760383, + "grad_norm": 0.12077611684799194, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1285 + }, + { + "epoch": 1.0271565495207668, + "grad_norm": 0.15061219036579132, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1286 + }, + { + "epoch": 1.0279552715654952, + "grad_norm": 0.15091058611869812, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1287 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 0.07299874722957611, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1288 + }, + { + "epoch": 1.029552715654952, + "grad_norm": 0.09598413854837418, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1289 + }, + { + "epoch": 1.0303514376996805, + "grad_norm": 0.21661055088043213, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 1290 + }, + { + "epoch": 1.031150159744409, + "grad_norm": 0.24777255952358246, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1291 + }, + { + "epoch": 1.0319488817891374, + "grad_norm": 0.17097236216068268, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1292 + }, + { + "epoch": 1.0327476038338659, + "grad_norm": 0.05266748368740082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1293 + }, + { + "epoch": 1.0335463258785942, + "grad_norm": 0.12484195083379745, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1294 + }, + { + "epoch": 1.0343450479233227, + "grad_norm": 0.1802505999803543, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1295 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 0.10778877139091492, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1296 + }, + { + "epoch": 1.0359424920127795, + "grad_norm": 0.046645063906908035, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1297 + }, + { + "epoch": 1.036741214057508, + "grad_norm": 0.11727745085954666, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1298 + }, + { + "epoch": 1.0375399361022364, + "grad_norm": 0.1356390118598938, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1299 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 0.08130940794944763, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1300 + }, + { + "epoch": 1.0391373801916932, + "grad_norm": 0.07274319976568222, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1301 + }, + { + "epoch": 1.0399361022364217, + "grad_norm": 0.20339541137218475, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1302 + }, + { + "epoch": 1.04073482428115, + "grad_norm": 0.27819424867630005, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 1303 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 0.25879770517349243, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1304 + }, + { + "epoch": 1.042332268370607, + "grad_norm": 0.12683863937854767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1305 + }, + { + "epoch": 1.0431309904153354, + "grad_norm": 0.13531504571437836, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1306 + }, + { + "epoch": 1.043929712460064, + "grad_norm": 0.3203699588775635, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1307 + }, + { + "epoch": 1.0447284345047922, + "grad_norm": 0.3073630630970001, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1308 + }, + { + "epoch": 1.0455271565495208, + "grad_norm": 0.13184015452861786, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1309 + }, + { + "epoch": 1.0463258785942493, + "grad_norm": 0.1311715543270111, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1310 + }, + { + "epoch": 1.0471246006389776, + "grad_norm": 0.24470581114292145, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1311 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 0.21901719272136688, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1312 + }, + { + "epoch": 1.0487220447284344, + "grad_norm": 0.08105460554361343, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1313 + }, + { + "epoch": 1.049520766773163, + "grad_norm": 0.14864705502986908, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1314 + }, + { + "epoch": 1.0503194888178913, + "grad_norm": 0.20006732642650604, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1315 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 0.06233162060379982, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1316 + }, + { + "epoch": 1.0519169329073483, + "grad_norm": 0.12691672146320343, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1317 + }, + { + "epoch": 1.0527156549520766, + "grad_norm": 0.18303292989730835, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1318 + }, + { + "epoch": 1.0535143769968052, + "grad_norm": 0.13289928436279297, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1319 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 0.03847618028521538, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1320 + }, + { + "epoch": 1.055111821086262, + "grad_norm": 0.1317387968301773, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1321 + }, + { + "epoch": 1.0559105431309903, + "grad_norm": 0.1663348227739334, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1322 + }, + { + "epoch": 1.0567092651757188, + "grad_norm": 0.0657038614153862, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1323 + }, + { + "epoch": 1.0575079872204474, + "grad_norm": 0.1484680026769638, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1324 + }, + { + "epoch": 1.0583067092651757, + "grad_norm": 0.299824595451355, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1325 + }, + { + "epoch": 1.0591054313099042, + "grad_norm": 0.3598216772079468, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1326 + }, + { + "epoch": 1.0599041533546325, + "grad_norm": 0.25792455673217773, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1327 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 0.04925544187426567, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1328 + }, + { + "epoch": 1.0615015974440896, + "grad_norm": 0.2568669319152832, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1329 + }, + { + "epoch": 1.0623003194888179, + "grad_norm": 0.2679016590118408, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1330 + }, + { + "epoch": 1.0630990415335464, + "grad_norm": 0.12100119888782501, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1331 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 0.17324721813201904, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1332 + }, + { + "epoch": 1.0646964856230032, + "grad_norm": 0.34452658891677856, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1333 + }, + { + "epoch": 1.0654952076677315, + "grad_norm": 0.24561382830142975, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1334 + }, + { + "epoch": 1.06629392971246, + "grad_norm": 0.06080634891986847, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1335 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 0.249319925904274, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1336 + }, + { + "epoch": 1.067891373801917, + "grad_norm": 0.2586004436016083, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1337 + }, + { + "epoch": 1.0686900958466454, + "grad_norm": 0.07297322154045105, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1338 + }, + { + "epoch": 1.0694888178913737, + "grad_norm": 0.20853886008262634, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1339 + }, + { + "epoch": 1.0702875399361023, + "grad_norm": 0.3214154541492462, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1340 + }, + { + "epoch": 1.0710862619808306, + "grad_norm": 0.16169136762619019, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1341 + }, + { + "epoch": 1.071884984025559, + "grad_norm": 0.18989364802837372, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1342 + }, + { + "epoch": 1.0726837060702876, + "grad_norm": 0.42826735973358154, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1343 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 0.35387369990348816, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1344 + }, + { + "epoch": 1.0742811501597445, + "grad_norm": 0.061617862433195114, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 1345 + }, + { + "epoch": 1.0750798722044728, + "grad_norm": 0.3348129987716675, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1346 + }, + { + "epoch": 1.0758785942492013, + "grad_norm": 0.3622291088104248, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1347 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 0.12743668258190155, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1348 + }, + { + "epoch": 1.0774760383386581, + "grad_norm": 0.2464202642440796, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1349 + }, + { + "epoch": 1.0782747603833867, + "grad_norm": 0.3873802423477173, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1350 + }, + { + "epoch": 1.079073482428115, + "grad_norm": 0.22619839012622833, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1351 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 0.09080081433057785, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 1352 + }, + { + "epoch": 1.0806709265175718, + "grad_norm": 0.31380224227905273, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1353 + }, + { + "epoch": 1.0814696485623003, + "grad_norm": 0.2782067060470581, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1354 + }, + { + "epoch": 1.0822683706070289, + "grad_norm": 0.04267412796616554, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1355 + }, + { + "epoch": 1.0830670926517572, + "grad_norm": 0.2687273919582367, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1356 + }, + { + "epoch": 1.0838658146964857, + "grad_norm": 0.3133341073989868, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1357 + }, + { + "epoch": 1.084664536741214, + "grad_norm": 0.11658725887537003, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1358 + }, + { + "epoch": 1.0854632587859425, + "grad_norm": 0.1339937299489975, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1359 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 0.15727631747722626, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1360 + }, + { + "epoch": 1.0870607028753994, + "grad_norm": 0.11759792268276215, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1361 + }, + { + "epoch": 1.0878594249201279, + "grad_norm": 0.11522746086120605, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1362 + }, + { + "epoch": 1.0886581469648562, + "grad_norm": 0.16571135818958282, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1363 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 0.09467484056949615, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1364 + }, + { + "epoch": 1.090255591054313, + "grad_norm": 0.07887586951255798, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1365 + }, + { + "epoch": 1.0910543130990416, + "grad_norm": 0.11297929286956787, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1366 + }, + { + "epoch": 1.09185303514377, + "grad_norm": 0.06402980536222458, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1367 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 0.11947043240070343, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1368 + }, + { + "epoch": 1.093450479233227, + "grad_norm": 0.06244207173585892, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1369 + }, + { + "epoch": 1.0942492012779552, + "grad_norm": 0.08165531605482101, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1370 + }, + { + "epoch": 1.0950479233226837, + "grad_norm": 0.03842553123831749, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 1371 + }, + { + "epoch": 1.095846645367412, + "grad_norm": 0.12175651639699936, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1372 + }, + { + "epoch": 1.0966453674121406, + "grad_norm": 0.1720212697982788, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1373 + }, + { + "epoch": 1.097444089456869, + "grad_norm": 0.15540143847465515, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1374 + }, + { + "epoch": 1.0982428115015974, + "grad_norm": 0.1056036502122879, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1375 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 0.06738443672657013, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1376 + }, + { + "epoch": 1.0998402555910542, + "grad_norm": 0.09600193798542023, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1377 + }, + { + "epoch": 1.1006389776357828, + "grad_norm": 0.11872005462646484, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1378 + }, + { + "epoch": 1.101437699680511, + "grad_norm": 0.04837389290332794, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1379 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 0.11245802789926529, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1380 + }, + { + "epoch": 1.1030351437699681, + "grad_norm": 0.1525758057832718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1381 + }, + { + "epoch": 1.1038338658146964, + "grad_norm": 0.07688060402870178, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1382 + }, + { + "epoch": 1.104632587859425, + "grad_norm": 0.05793362855911255, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1383 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 0.09737680107355118, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1384 + }, + { + "epoch": 1.1062300319488818, + "grad_norm": 0.15511851012706757, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1385 + }, + { + "epoch": 1.1070287539936103, + "grad_norm": 0.14931945502758026, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1386 + }, + { + "epoch": 1.1078274760383386, + "grad_norm": 0.1451406478881836, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1387 + }, + { + "epoch": 1.1086261980830672, + "grad_norm": 0.06013273820281029, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1388 + }, + { + "epoch": 1.1094249201277955, + "grad_norm": 0.08433987945318222, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1389 + }, + { + "epoch": 1.110223642172524, + "grad_norm": 0.12601709365844727, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1390 + }, + { + "epoch": 1.1110223642172523, + "grad_norm": 0.14611507952213287, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1391 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 0.10526898503303528, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1392 + }, + { + "epoch": 1.1126198083067094, + "grad_norm": 0.03592250496149063, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1393 + }, + { + "epoch": 1.1134185303514377, + "grad_norm": 0.07883994281291962, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1394 + }, + { + "epoch": 1.1142172523961662, + "grad_norm": 0.1351863145828247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1395 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 0.10423804074525833, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1396 + }, + { + "epoch": 1.115814696485623, + "grad_norm": 0.05230586603283882, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1397 + }, + { + "epoch": 1.1166134185303513, + "grad_norm": 0.03962033987045288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1398 + }, + { + "epoch": 1.1174121405750799, + "grad_norm": 0.08950864523649216, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1399 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.1326761394739151, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1400 + }, + { + "epoch": 1.1190095846645367, + "grad_norm": 0.1251986175775528, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1401 + }, + { + "epoch": 1.1198083067092652, + "grad_norm": 0.05831597000360489, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1402 + }, + { + "epoch": 1.1206070287539935, + "grad_norm": 0.11382800340652466, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1403 + }, + { + "epoch": 1.121405750798722, + "grad_norm": 0.16290108859539032, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1404 + }, + { + "epoch": 1.1222044728434506, + "grad_norm": 0.1721554696559906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1405 + }, + { + "epoch": 1.123003194888179, + "grad_norm": 0.09426763653755188, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1406 + }, + { + "epoch": 1.1238019169329074, + "grad_norm": 0.037366580218076706, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1407 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 0.07456237077713013, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1408 + }, + { + "epoch": 1.1253993610223643, + "grad_norm": 0.11701856553554535, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1409 + }, + { + "epoch": 1.1261980830670926, + "grad_norm": 0.13261918723583221, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1410 + }, + { + "epoch": 1.126996805111821, + "grad_norm": 0.09014345705509186, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1411 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 0.05398619920015335, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1412 + }, + { + "epoch": 1.128594249201278, + "grad_norm": 0.09375960379838943, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1413 + }, + { + "epoch": 1.1293929712460065, + "grad_norm": 0.09307628124952316, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1414 + }, + { + "epoch": 1.1301916932907348, + "grad_norm": 0.09488195180892944, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1415 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 0.08067089319229126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1416 + }, + { + "epoch": 1.1317891373801916, + "grad_norm": 0.043899055570364, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1417 + }, + { + "epoch": 1.1325878594249201, + "grad_norm": 0.05593986064195633, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1418 + }, + { + "epoch": 1.1333865814696487, + "grad_norm": 0.05736452341079712, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1419 + }, + { + "epoch": 1.134185303514377, + "grad_norm": 0.1092999204993248, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1420 + }, + { + "epoch": 1.1349840255591055, + "grad_norm": 0.18366938829421997, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1421 + }, + { + "epoch": 1.1357827476038338, + "grad_norm": 0.177176833152771, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1422 + }, + { + "epoch": 1.1365814696485623, + "grad_norm": 0.08829191327095032, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1423 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 0.07169382274150848, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1424 + }, + { + "epoch": 1.1381789137380192, + "grad_norm": 0.130388081073761, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1425 + }, + { + "epoch": 1.1389776357827477, + "grad_norm": 0.20726168155670166, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1426 + }, + { + "epoch": 1.139776357827476, + "grad_norm": 0.21683751046657562, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1427 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 0.131125345826149, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1428 + }, + { + "epoch": 1.1413738019169328, + "grad_norm": 0.04309925064444542, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1429 + }, + { + "epoch": 1.1421725239616614, + "grad_norm": 0.14427928626537323, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1430 + }, + { + "epoch": 1.1429712460063897, + "grad_norm": 0.1743481606245041, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1431 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 0.1037210002541542, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1432 + }, + { + "epoch": 1.1445686900958467, + "grad_norm": 0.11162228137254715, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1433 + }, + { + "epoch": 1.145367412140575, + "grad_norm": 0.25445371866226196, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1434 + }, + { + "epoch": 1.1461661341853036, + "grad_norm": 0.2771884799003601, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1435 + }, + { + "epoch": 1.1469648562300319, + "grad_norm": 0.10653509199619293, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1436 + }, + { + "epoch": 1.1477635782747604, + "grad_norm": 0.1745259016752243, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1437 + }, + { + "epoch": 1.148562300319489, + "grad_norm": 0.3151826560497284, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1438 + }, + { + "epoch": 1.1493610223642172, + "grad_norm": 0.23229722678661346, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1439 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 0.06131701543927193, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1440 + }, + { + "epoch": 1.150958466453674, + "grad_norm": 0.28753313422203064, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1441 + }, + { + "epoch": 1.1517571884984026, + "grad_norm": 0.3178791105747223, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1442 + }, + { + "epoch": 1.1525559105431311, + "grad_norm": 0.10008880496025085, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1443 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 0.2418096512556076, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1444 + }, + { + "epoch": 1.154153354632588, + "grad_norm": 0.34728583693504333, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1445 + }, + { + "epoch": 1.1549520766773163, + "grad_norm": 0.2172212153673172, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1446 + }, + { + "epoch": 1.1557507987220448, + "grad_norm": 0.04184277728199959, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1447 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 0.19960719347000122, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1448 + }, + { + "epoch": 1.1573482428115016, + "grad_norm": 0.19261692464351654, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1449 + }, + { + "epoch": 1.15814696485623, + "grad_norm": 0.08326124399900436, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1450 + }, + { + "epoch": 1.1589456869009584, + "grad_norm": 0.08552456647157669, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1451 + }, + { + "epoch": 1.159744408945687, + "grad_norm": 0.07903868705034256, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 1452 + }, + { + "epoch": 1.1605431309904153, + "grad_norm": 0.045095205307006836, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1453 + }, + { + "epoch": 1.1613418530351438, + "grad_norm": 0.08293266594409943, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1454 + }, + { + "epoch": 1.1621405750798721, + "grad_norm": 0.09431439638137817, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1455 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 0.04189104586839676, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1456 + }, + { + "epoch": 1.1637380191693292, + "grad_norm": 0.11492408066987991, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1457 + }, + { + "epoch": 1.1645367412140575, + "grad_norm": 0.16648449003696442, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1458 + }, + { + "epoch": 1.165335463258786, + "grad_norm": 0.1532576084136963, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1459 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 0.07438737154006958, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1460 + }, + { + "epoch": 1.1669329073482428, + "grad_norm": 0.0887872502207756, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 1461 + }, + { + "epoch": 1.1677316293929714, + "grad_norm": 0.17035096883773804, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1462 + }, + { + "epoch": 1.1685303514376997, + "grad_norm": 0.12702526152133942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1463 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 0.04788994789123535, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1464 + }, + { + "epoch": 1.1701277955271565, + "grad_norm": 0.15093912184238434, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1465 + }, + { + "epoch": 1.170926517571885, + "grad_norm": 0.1428089439868927, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1466 + }, + { + "epoch": 1.1717252396166133, + "grad_norm": 0.039421554654836655, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1467 + }, + { + "epoch": 1.1725239616613419, + "grad_norm": 0.09461840242147446, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1468 + }, + { + "epoch": 1.1733226837060702, + "grad_norm": 0.07272787392139435, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1469 + }, + { + "epoch": 1.1741214057507987, + "grad_norm": 0.10863790661096573, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1470 + }, + { + "epoch": 1.1749201277955272, + "grad_norm": 0.211805522441864, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1471 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 0.2124311476945877, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1472 + }, + { + "epoch": 1.176517571884984, + "grad_norm": 0.14013712108135223, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1473 + }, + { + "epoch": 1.1773162939297124, + "grad_norm": 0.10768178105354309, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1474 + }, + { + "epoch": 1.178115015974441, + "grad_norm": 0.07961699366569519, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1475 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 0.0772516280412674, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1476 + }, + { + "epoch": 1.1797124600638977, + "grad_norm": 0.11957084387540817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1477 + }, + { + "epoch": 1.1805111821086263, + "grad_norm": 0.1976107954978943, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1478 + }, + { + "epoch": 1.1813099041533546, + "grad_norm": 0.20915871858596802, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1479 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 0.10857495665550232, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1480 + }, + { + "epoch": 1.1829073482428114, + "grad_norm": 0.09961260855197906, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1481 + }, + { + "epoch": 1.18370607028754, + "grad_norm": 0.11908663064241409, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1482 + }, + { + "epoch": 1.1845047923322685, + "grad_norm": 0.0982719212770462, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1483 + }, + { + "epoch": 1.1853035143769968, + "grad_norm": 0.05869903787970543, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1484 + }, + { + "epoch": 1.1861022364217253, + "grad_norm": 0.14943145215511322, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1485 + }, + { + "epoch": 1.1869009584664536, + "grad_norm": 0.1761479526758194, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1486 + }, + { + "epoch": 1.1876996805111821, + "grad_norm": 0.1393168866634369, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1487 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 0.0473988801240921, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1488 + }, + { + "epoch": 1.189297124600639, + "grad_norm": 0.20789027214050293, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1489 + }, + { + "epoch": 1.1900958466453675, + "grad_norm": 0.29456260800361633, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1490 + }, + { + "epoch": 1.1908945686900958, + "grad_norm": 0.1875244528055191, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1491 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 0.052052468061447144, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1492 + }, + { + "epoch": 1.1924920127795526, + "grad_norm": 0.1376652717590332, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1493 + }, + { + "epoch": 1.1932907348242812, + "grad_norm": 0.1656588762998581, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1494 + }, + { + "epoch": 1.1940894568690097, + "grad_norm": 0.07063707709312439, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1495 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 0.12681347131729126, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1496 + }, + { + "epoch": 1.1956869009584665, + "grad_norm": 0.17560099065303802, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1497 + }, + { + "epoch": 1.1964856230031948, + "grad_norm": 0.10635025054216385, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1498 + }, + { + "epoch": 1.1972843450479234, + "grad_norm": 0.061567965894937515, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1499 + }, + { + "epoch": 1.1980830670926517, + "grad_norm": 0.12346719950437546, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1500 + }, + { + "epoch": 1.1988817891373802, + "grad_norm": 0.07105513662099838, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1501 + }, + { + "epoch": 1.1996805111821087, + "grad_norm": 0.07719466835260391, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1502 + }, + { + "epoch": 1.200479233226837, + "grad_norm": 0.1478763371706009, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1503 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 0.1383642554283142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1504 + }, + { + "epoch": 1.2020766773162939, + "grad_norm": 0.05519767478108406, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1505 + }, + { + "epoch": 1.2028753993610224, + "grad_norm": 0.06807537376880646, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1506 + }, + { + "epoch": 1.2036741214057507, + "grad_norm": 0.10652226209640503, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1507 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 0.044540517032146454, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1508 + }, + { + "epoch": 1.2052715654952078, + "grad_norm": 0.12266546487808228, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1509 + }, + { + "epoch": 1.206070287539936, + "grad_norm": 0.1997641921043396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1510 + }, + { + "epoch": 1.2068690095846646, + "grad_norm": 0.1924593299627304, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1511 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 0.09990391880273819, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1512 + }, + { + "epoch": 1.2084664536741214, + "grad_norm": 0.04226391762495041, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1513 + }, + { + "epoch": 1.20926517571885, + "grad_norm": 0.07116132974624634, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1514 + }, + { + "epoch": 1.2100638977635783, + "grad_norm": 0.046046894043684006, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1515 + }, + { + "epoch": 1.2108626198083068, + "grad_norm": 0.039608217775821686, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1516 + }, + { + "epoch": 1.211661341853035, + "grad_norm": 0.055937573313713074, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1517 + }, + { + "epoch": 1.2124600638977636, + "grad_norm": 0.09269243478775024, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1518 + }, + { + "epoch": 1.213258785942492, + "grad_norm": 0.04349381849169731, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1519 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 0.08543939888477325, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1520 + }, + { + "epoch": 1.2148562300319488, + "grad_norm": 0.1829536110162735, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1521 + }, + { + "epoch": 1.2156549520766773, + "grad_norm": 0.23422624170780182, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1522 + }, + { + "epoch": 1.2164536741214058, + "grad_norm": 0.13391408324241638, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1523 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 0.07262124121189117, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1524 + }, + { + "epoch": 1.2180511182108626, + "grad_norm": 0.1842898577451706, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1525 + }, + { + "epoch": 1.218849840255591, + "grad_norm": 0.16982080042362213, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1526 + }, + { + "epoch": 1.2196485623003195, + "grad_norm": 0.07628878951072693, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1527 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 0.07903175801038742, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1528 + }, + { + "epoch": 1.2212460063897763, + "grad_norm": 0.1874074637889862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1529 + }, + { + "epoch": 1.2220447284345048, + "grad_norm": 0.2084639072418213, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1530 + }, + { + "epoch": 1.2228434504792332, + "grad_norm": 0.161276176571846, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1531 + }, + { + "epoch": 1.2236421725239617, + "grad_norm": 0.07408371567726135, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1532 + }, + { + "epoch": 1.2244408945686902, + "grad_norm": 0.06918113678693771, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1533 + }, + { + "epoch": 1.2252396166134185, + "grad_norm": 0.15813148021697998, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1534 + }, + { + "epoch": 1.226038338658147, + "grad_norm": 0.1454530507326126, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 1535 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 0.07441768050193787, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1536 + }, + { + "epoch": 1.2276357827476039, + "grad_norm": 0.19151917099952698, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1537 + }, + { + "epoch": 1.2284345047923322, + "grad_norm": 0.22358526289463043, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1538 + }, + { + "epoch": 1.2292332268370607, + "grad_norm": 0.12382426857948303, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1539 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 0.09593929350376129, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1540 + }, + { + "epoch": 1.2308306709265175, + "grad_norm": 0.32887372374534607, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1541 + }, + { + "epoch": 1.231629392971246, + "grad_norm": 0.3910810351371765, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1542 + }, + { + "epoch": 1.2324281150159744, + "grad_norm": 0.21341568231582642, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1543 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 0.10242578387260437, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1544 + }, + { + "epoch": 1.2340255591054312, + "grad_norm": 0.2556541860103607, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 1.2348242811501597, + "grad_norm": 0.22671715915203094, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1546 + }, + { + "epoch": 1.2356230031948883, + "grad_norm": 0.05781029909849167, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1547 + }, + { + "epoch": 1.2364217252396166, + "grad_norm": 0.2803215980529785, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1548 + }, + { + "epoch": 1.237220447284345, + "grad_norm": 0.3391420543193817, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1549 + }, + { + "epoch": 1.2380191693290734, + "grad_norm": 0.17648665606975555, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1550 + }, + { + "epoch": 1.238817891373802, + "grad_norm": 0.14975208044052124, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1551 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 0.2930659353733063, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1552 + }, + { + "epoch": 1.2404153354632588, + "grad_norm": 0.16080376505851746, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1553 + }, + { + "epoch": 1.2412140575079873, + "grad_norm": 0.1765553057193756, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1554 + }, + { + "epoch": 1.2420127795527156, + "grad_norm": 0.43610313534736633, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1555 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 0.3448547124862671, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1556 + }, + { + "epoch": 1.2436102236421724, + "grad_norm": 0.11257574707269669, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1557 + }, + { + "epoch": 1.244408945686901, + "grad_norm": 0.2212686389684677, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1558 + }, + { + "epoch": 1.2452076677316293, + "grad_norm": 0.24576987326145172, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 1559 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 0.07592078298330307, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1560 + }, + { + "epoch": 1.2468051118210863, + "grad_norm": 0.18566438555717468, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1561 + }, + { + "epoch": 1.2476038338658146, + "grad_norm": 0.2345304936170578, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1562 + }, + { + "epoch": 1.2484025559105432, + "grad_norm": 0.12168031930923462, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1563 + }, + { + "epoch": 1.2492012779552715, + "grad_norm": 0.10168169438838959, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1564 + }, + { + "epoch": 1.25, + "grad_norm": 0.14832071959972382, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1565 + }, + { + "epoch": 1.2507987220447285, + "grad_norm": 0.04516097158193588, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1566 + }, + { + "epoch": 1.2515974440894568, + "grad_norm": 0.14377422630786896, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1567 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 0.12483170628547668, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1568 + }, + { + "epoch": 1.2531948881789137, + "grad_norm": 0.06861971318721771, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1569 + }, + { + "epoch": 1.2539936102236422, + "grad_norm": 0.1124153807759285, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1570 + }, + { + "epoch": 1.2547923322683707, + "grad_norm": 0.16883404552936554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1571 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 0.09533397108316422, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1572 + }, + { + "epoch": 1.2563897763578276, + "grad_norm": 0.09215923398733139, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1573 + }, + { + "epoch": 1.2571884984025559, + "grad_norm": 0.12701599299907684, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1574 + }, + { + "epoch": 1.2579872204472844, + "grad_norm": 0.09106232225894928, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1575 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 0.047954440116882324, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1576 + }, + { + "epoch": 1.2595846645367412, + "grad_norm": 0.13917528092861176, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1577 + }, + { + "epoch": 1.2603833865814695, + "grad_norm": 0.17694029211997986, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1578 + }, + { + "epoch": 1.261182108626198, + "grad_norm": 0.11021065711975098, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1579 + }, + { + "epoch": 1.2619808306709266, + "grad_norm": 0.03982831537723541, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1580 + }, + { + "epoch": 1.262779552715655, + "grad_norm": 0.08759493380784988, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1581 + }, + { + "epoch": 1.2635782747603834, + "grad_norm": 0.04797520861029625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1582 + }, + { + "epoch": 1.2643769968051117, + "grad_norm": 0.049942485988140106, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 1583 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 0.04236803576350212, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1584 + }, + { + "epoch": 1.2659744408945688, + "grad_norm": 0.05938104912638664, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1585 + }, + { + "epoch": 1.266773162939297, + "grad_norm": 0.07487885653972626, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1586 + }, + { + "epoch": 1.2675718849840256, + "grad_norm": 0.063072569668293, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1587 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 0.07140504568815231, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1588 + }, + { + "epoch": 1.2691693290734825, + "grad_norm": 0.04790132865309715, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1589 + }, + { + "epoch": 1.269968051118211, + "grad_norm": 0.050013668835163116, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1590 + }, + { + "epoch": 1.2707667731629393, + "grad_norm": 0.0559731163084507, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1591 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 0.04633013904094696, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1592 + }, + { + "epoch": 1.2723642172523961, + "grad_norm": 0.05252271518111229, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1593 + }, + { + "epoch": 1.2731629392971247, + "grad_norm": 0.0902840718626976, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1594 + }, + { + "epoch": 1.273961661341853, + "grad_norm": 0.07961871474981308, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1595 + }, + { + "epoch": 1.2747603833865815, + "grad_norm": 0.07653608173131943, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1596 + }, + { + "epoch": 1.2755591054313098, + "grad_norm": 0.15634121000766754, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1597 + }, + { + "epoch": 1.2763578274760383, + "grad_norm": 0.2045222818851471, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1598 + }, + { + "epoch": 1.2771565495207668, + "grad_norm": 0.1769608110189438, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1599 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.09675133973360062, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1600 + }, + { + "epoch": 1.2787539936102237, + "grad_norm": 0.055832285434007645, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1601 + }, + { + "epoch": 1.279552715654952, + "grad_norm": 0.09108291566371918, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1602 + }, + { + "epoch": 1.2803514376996805, + "grad_norm": 0.10872901976108551, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1603 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 0.08771848678588867, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1604 + }, + { + "epoch": 1.2819488817891374, + "grad_norm": 0.0731026753783226, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1605 + }, + { + "epoch": 1.2827476038338659, + "grad_norm": 0.040664345026016235, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1606 + }, + { + "epoch": 1.2835463258785942, + "grad_norm": 0.06111081317067146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1607 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 0.08753795176744461, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1608 + }, + { + "epoch": 1.2851437699680512, + "grad_norm": 0.07113729417324066, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1609 + }, + { + "epoch": 1.2859424920127795, + "grad_norm": 0.05469372868537903, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1610 + }, + { + "epoch": 1.2867412140575079, + "grad_norm": 0.05748649686574936, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1611 + }, + { + "epoch": 1.2875399361022364, + "grad_norm": 0.05832446366548538, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1612 + }, + { + "epoch": 1.288338658146965, + "grad_norm": 0.06085522472858429, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1613 + }, + { + "epoch": 1.2891373801916932, + "grad_norm": 0.08154775947332382, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1614 + }, + { + "epoch": 1.2899361022364217, + "grad_norm": 0.11568816751241684, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1615 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 0.06356564909219742, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1616 + }, + { + "epoch": 1.2915335463258786, + "grad_norm": 0.08187399804592133, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1617 + }, + { + "epoch": 1.292332268370607, + "grad_norm": 0.05326744168996811, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1618 + }, + { + "epoch": 1.2931309904153354, + "grad_norm": 0.05407040938735008, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1619 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 0.07292867451906204, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 1620 + }, + { + "epoch": 1.2947284345047922, + "grad_norm": 0.09447437524795532, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1621 + }, + { + "epoch": 1.2955271565495208, + "grad_norm": 0.0592079721391201, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1622 + }, + { + "epoch": 1.2963258785942493, + "grad_norm": 0.052008479833602905, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1623 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 0.06381972879171371, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1624 + }, + { + "epoch": 1.2979233226837061, + "grad_norm": 0.07434900850057602, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1625 + }, + { + "epoch": 1.2987220447284344, + "grad_norm": 0.06477486342191696, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1626 + }, + { + "epoch": 1.299520766773163, + "grad_norm": 0.13730554282665253, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1627 + }, + { + "epoch": 1.3003194888178915, + "grad_norm": 0.1683935821056366, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1628 + }, + { + "epoch": 1.3011182108626198, + "grad_norm": 0.08616848289966583, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1629 + }, + { + "epoch": 1.3019169329073481, + "grad_norm": 0.10220590978860855, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1630 + }, + { + "epoch": 1.3027156549520766, + "grad_norm": 0.22036917507648468, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1631 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.2277965545654297, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1632 + }, + { + "epoch": 1.3043130990415335, + "grad_norm": 0.10426606982946396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1633 + }, + { + "epoch": 1.305111821086262, + "grad_norm": 0.06641022861003876, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1634 + }, + { + "epoch": 1.3059105431309903, + "grad_norm": 0.09100072830915451, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1635 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 0.06551069766283035, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1636 + }, + { + "epoch": 1.3075079872204474, + "grad_norm": 0.04397547245025635, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1637 + }, + { + "epoch": 1.3083067092651757, + "grad_norm": 0.0781746581196785, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1638 + }, + { + "epoch": 1.3091054313099042, + "grad_norm": 0.07852843403816223, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1639 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 0.09224545955657959, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1640 + }, + { + "epoch": 1.310702875399361, + "grad_norm": 0.10179189592599869, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1641 + }, + { + "epoch": 1.3115015974440896, + "grad_norm": 0.07562009245157242, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1642 + }, + { + "epoch": 1.3123003194888179, + "grad_norm": 0.15463820099830627, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1643 + }, + { + "epoch": 1.3130990415335464, + "grad_norm": 0.05742334946990013, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 1644 + }, + { + "epoch": 1.3138977635782747, + "grad_norm": 0.09010195732116699, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1645 + }, + { + "epoch": 1.3146964856230032, + "grad_norm": 0.04284297674894333, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1646 + }, + { + "epoch": 1.3154952076677318, + "grad_norm": 0.07167239487171173, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1647 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 0.04978404566645622, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1648 + }, + { + "epoch": 1.3170926517571884, + "grad_norm": 0.2888668477535248, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1649 + }, + { + "epoch": 1.317891373801917, + "grad_norm": 0.13716880977153778, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1650 + }, + { + "epoch": 1.3186900958466454, + "grad_norm": 0.13081762194633484, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1651 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 0.046977054327726364, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1652 + }, + { + "epoch": 1.3202875399361023, + "grad_norm": 0.1331615000963211, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 1653 + }, + { + "epoch": 1.3210862619808306, + "grad_norm": 0.21066126227378845, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 1654 + }, + { + "epoch": 1.321884984025559, + "grad_norm": 0.23017194867134094, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1655 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 0.20224629342556, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1656 + }, + { + "epoch": 1.323482428115016, + "grad_norm": 0.09836700558662415, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1657 + }, + { + "epoch": 1.3242811501597445, + "grad_norm": 0.10621663928031921, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1658 + }, + { + "epoch": 1.3250798722044728, + "grad_norm": 0.25464868545532227, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1659 + }, + { + "epoch": 1.3258785942492013, + "grad_norm": 0.39965251088142395, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1660 + }, + { + "epoch": 1.3266773162939298, + "grad_norm": 0.4731796383857727, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1661 + }, + { + "epoch": 1.3274760383386581, + "grad_norm": 0.4287014603614807, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1662 + }, + { + "epoch": 1.3282747603833867, + "grad_norm": 0.15660974383354187, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1663 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.14340882003307343, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1664 + }, + { + "epoch": 1.3298722044728435, + "grad_norm": 0.23041795194149017, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1665 + }, + { + "epoch": 1.330670926517572, + "grad_norm": 0.14607569575309753, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1666 + }, + { + "epoch": 1.3314696485623003, + "grad_norm": 0.0620175264775753, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1667 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 0.1722227782011032, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1668 + }, + { + "epoch": 1.3330670926517572, + "grad_norm": 0.17676329612731934, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1669 + }, + { + "epoch": 1.3338658146964857, + "grad_norm": 0.10175948590040207, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1670 + }, + { + "epoch": 1.334664536741214, + "grad_norm": 0.052259646356105804, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1671 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 0.11740414053201675, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1672 + }, + { + "epoch": 1.3362619808306708, + "grad_norm": 0.13614653050899506, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1673 + }, + { + "epoch": 1.3370607028753994, + "grad_norm": 0.12058388441801071, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 1674 + }, + { + "epoch": 1.3378594249201279, + "grad_norm": 0.12473122030496597, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1675 + }, + { + "epoch": 1.3386581469648562, + "grad_norm": 0.11198705434799194, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1676 + }, + { + "epoch": 1.3394568690095847, + "grad_norm": 0.06745828688144684, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1677 + }, + { + "epoch": 1.340255591054313, + "grad_norm": 0.06042877584695816, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1678 + }, + { + "epoch": 1.3410543130990416, + "grad_norm": 0.08762289583683014, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1679 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 0.07612926512956619, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1680 + }, + { + "epoch": 1.3426517571884984, + "grad_norm": 0.16108228266239166, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1681 + }, + { + "epoch": 1.343450479233227, + "grad_norm": 0.12803438305854797, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1682 + }, + { + "epoch": 1.3442492012779552, + "grad_norm": 0.09190207719802856, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1683 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 0.07201807200908661, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1684 + }, + { + "epoch": 1.3458466453674123, + "grad_norm": 0.06885793805122375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1685 + }, + { + "epoch": 1.3466453674121406, + "grad_norm": 0.06998719274997711, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1686 + }, + { + "epoch": 1.3474440894568689, + "grad_norm": 0.08072122186422348, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1687 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 0.1314389705657959, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1688 + }, + { + "epoch": 1.349041533546326, + "grad_norm": 0.1393643617630005, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1689 + }, + { + "epoch": 1.3498402555910542, + "grad_norm": 0.1482846736907959, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1690 + }, + { + "epoch": 1.3506389776357828, + "grad_norm": 0.10097873955965042, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1691 + }, + { + "epoch": 1.351437699680511, + "grad_norm": 0.16020123660564423, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1692 + }, + { + "epoch": 1.3522364217252396, + "grad_norm": 0.4032374322414398, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1693 + }, + { + "epoch": 1.3530351437699681, + "grad_norm": 0.21653197705745697, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1694 + }, + { + "epoch": 1.3538338658146964, + "grad_norm": 0.18634478747844696, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1695 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 0.06293921917676926, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1696 + }, + { + "epoch": 1.3554313099041533, + "grad_norm": 0.09862471371889114, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1697 + }, + { + "epoch": 1.3562300319488818, + "grad_norm": 0.17562821507453918, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1698 + }, + { + "epoch": 1.3570287539936103, + "grad_norm": 0.17277459800243378, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1699 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 0.06883158534765244, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1700 + }, + { + "epoch": 1.3586261980830672, + "grad_norm": 0.06487718969583511, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1701 + }, + { + "epoch": 1.3594249201277955, + "grad_norm": 0.08988886326551437, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1702 + }, + { + "epoch": 1.360223642172524, + "grad_norm": 0.05164919048547745, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1703 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 0.143778458237648, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1704 + }, + { + "epoch": 1.3618210862619808, + "grad_norm": 0.21736390888690948, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1705 + }, + { + "epoch": 1.3626198083067091, + "grad_norm": 0.2496086061000824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1706 + }, + { + "epoch": 1.3634185303514377, + "grad_norm": 0.21299317479133606, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1707 + }, + { + "epoch": 1.3642172523961662, + "grad_norm": 0.06845723092556, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1708 + }, + { + "epoch": 1.3650159744408945, + "grad_norm": 0.14018614590168, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1709 + }, + { + "epoch": 1.365814696485623, + "grad_norm": 0.1971539407968521, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1710 + }, + { + "epoch": 1.3666134185303513, + "grad_norm": 0.10819724202156067, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 1711 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 0.12900666892528534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1712 + }, + { + "epoch": 1.3682108626198084, + "grad_norm": 0.17080886662006378, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1713 + }, + { + "epoch": 1.3690095846645367, + "grad_norm": 0.22689902782440186, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1714 + }, + { + "epoch": 1.3698083067092652, + "grad_norm": 0.2200036197900772, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1715 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 0.15193268656730652, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1716 + }, + { + "epoch": 1.371405750798722, + "grad_norm": 0.057297177612781525, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1717 + }, + { + "epoch": 1.3722044728434506, + "grad_norm": 0.12024576961994171, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1718 + }, + { + "epoch": 1.373003194888179, + "grad_norm": 0.16183575987815857, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1719 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 0.14740106463432312, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 1720 + }, + { + "epoch": 1.3746006389776357, + "grad_norm": 0.09009548276662827, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1721 + }, + { + "epoch": 1.3753993610223643, + "grad_norm": 0.05091484636068344, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1722 + }, + { + "epoch": 1.3761980830670926, + "grad_norm": 0.05887647345662117, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1723 + }, + { + "epoch": 1.376996805111821, + "grad_norm": 0.06313642859458923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1724 + }, + { + "epoch": 1.3777955271565494, + "grad_norm": 0.06496263295412064, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1725 + }, + { + "epoch": 1.378594249201278, + "grad_norm": 0.06047922000288963, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1726 + }, + { + "epoch": 1.3793929712460065, + "grad_norm": 0.05579136312007904, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 1727 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 0.05931869521737099, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1728 + }, + { + "epoch": 1.3809904153354633, + "grad_norm": 0.049043234437704086, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1729 + }, + { + "epoch": 1.3817891373801916, + "grad_norm": 0.051883842796087265, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1730 + }, + { + "epoch": 1.3825878594249201, + "grad_norm": 0.07195441424846649, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1731 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 0.12339463829994202, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1732 + }, + { + "epoch": 1.384185303514377, + "grad_norm": 0.16951170563697815, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1733 + }, + { + "epoch": 1.3849840255591055, + "grad_norm": 0.1773078590631485, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1734 + }, + { + "epoch": 1.3857827476038338, + "grad_norm": 0.15160880982875824, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1735 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 0.12933489680290222, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1736 + }, + { + "epoch": 1.3873801916932909, + "grad_norm": 0.05910791456699371, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1737 + }, + { + "epoch": 1.3881789137380192, + "grad_norm": 0.06765501946210861, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1738 + }, + { + "epoch": 1.3889776357827475, + "grad_norm": 0.09179043024778366, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1739 + }, + { + "epoch": 1.389776357827476, + "grad_norm": 0.08842387795448303, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 1740 + }, + { + "epoch": 1.3905750798722045, + "grad_norm": 0.07700884342193604, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1741 + }, + { + "epoch": 1.3913738019169328, + "grad_norm": 0.045392196625471115, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1742 + }, + { + "epoch": 1.3921725239616614, + "grad_norm": 0.11977320909500122, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1743 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 0.1882479041814804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1744 + }, + { + "epoch": 1.3937699680511182, + "grad_norm": 0.25021475553512573, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1745 + }, + { + "epoch": 1.3945686900958467, + "grad_norm": 0.23374556005001068, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1746 + }, + { + "epoch": 1.395367412140575, + "grad_norm": 0.1016339659690857, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1747 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 0.1340985745191574, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1748 + }, + { + "epoch": 1.3969648562300319, + "grad_norm": 0.21048963069915771, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1749 + }, + { + "epoch": 1.3977635782747604, + "grad_norm": 0.20711666345596313, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1750 + }, + { + "epoch": 1.398562300319489, + "grad_norm": 0.19101384282112122, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1751 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 0.17655788362026215, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1752 + }, + { + "epoch": 1.4001597444089458, + "grad_norm": 0.11994078010320663, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1753 + }, + { + "epoch": 1.400958466453674, + "grad_norm": 0.09805315732955933, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1754 + }, + { + "epoch": 1.4017571884984026, + "grad_norm": 0.07474519312381744, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1755 + }, + { + "epoch": 1.4025559105431311, + "grad_norm": 0.11269772797822952, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1756 + }, + { + "epoch": 1.4033546325878594, + "grad_norm": 0.08900775015354156, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1757 + }, + { + "epoch": 1.4041533546325877, + "grad_norm": 0.05614674836397171, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1758 + }, + { + "epoch": 1.4049520766773163, + "grad_norm": 0.12895621359348297, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1759 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 0.16433797776699066, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1760 + }, + { + "epoch": 1.406549520766773, + "grad_norm": 0.20009422302246094, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1761 + }, + { + "epoch": 1.4073482428115016, + "grad_norm": 0.146495059132576, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1762 + }, + { + "epoch": 1.40814696485623, + "grad_norm": 0.07518120110034943, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1763 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 0.09864111244678497, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1764 + }, + { + "epoch": 1.409744408945687, + "grad_norm": 0.20213425159454346, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 1765 + }, + { + "epoch": 1.4105431309904153, + "grad_norm": 0.17369656264781952, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1766 + }, + { + "epoch": 1.4113418530351438, + "grad_norm": 0.06627536565065384, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1767 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 0.09098218381404877, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1768 + }, + { + "epoch": 1.4129392971246006, + "grad_norm": 0.11730248481035233, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1769 + }, + { + "epoch": 1.4137380191693292, + "grad_norm": 0.07061973959207535, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1770 + }, + { + "epoch": 1.4145367412140575, + "grad_norm": 0.10279946774244308, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1771 + }, + { + "epoch": 1.415335463258786, + "grad_norm": 0.18082919716835022, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1772 + }, + { + "epoch": 1.4161341853035143, + "grad_norm": 0.1592867076396942, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1773 + }, + { + "epoch": 1.4169329073482428, + "grad_norm": 0.09976492077112198, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1774 + }, + { + "epoch": 1.4177316293929714, + "grad_norm": 0.060737378895282745, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1775 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 0.06248186528682709, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1776 + }, + { + "epoch": 1.419329073482428, + "grad_norm": 0.13300968706607819, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1777 + }, + { + "epoch": 1.4201277955271565, + "grad_norm": 0.1979697346687317, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1778 + }, + { + "epoch": 1.420926517571885, + "grad_norm": 0.23268306255340576, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1779 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 0.18313626945018768, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1780 + }, + { + "epoch": 1.4225239616613419, + "grad_norm": 0.08110051602125168, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1781 + }, + { + "epoch": 1.4233226837060702, + "grad_norm": 0.09732743352651596, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1782 + }, + { + "epoch": 1.4241214057507987, + "grad_norm": 0.1656067669391632, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1783 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 0.1959427297115326, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1784 + }, + { + "epoch": 1.4257188498402555, + "grad_norm": 0.17609809339046478, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1785 + }, + { + "epoch": 1.426517571884984, + "grad_norm": 0.0999840646982193, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1786 + }, + { + "epoch": 1.4273162939297124, + "grad_norm": 0.06475909799337387, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1787 + }, + { + "epoch": 1.428115015974441, + "grad_norm": 0.1364496946334839, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1788 + }, + { + "epoch": 1.4289137380191694, + "grad_norm": 0.21113638579845428, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 1789 + }, + { + "epoch": 1.4297124600638977, + "grad_norm": 0.25998085737228394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1790 + }, + { + "epoch": 1.4305111821086263, + "grad_norm": 0.24930700659751892, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1791 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 0.131307452917099, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1792 + }, + { + "epoch": 1.432108626198083, + "grad_norm": 0.0739457756280899, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1793 + }, + { + "epoch": 1.4329073482428116, + "grad_norm": 0.2009744644165039, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1794 + }, + { + "epoch": 1.43370607028754, + "grad_norm": 0.28875023126602173, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1795 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 0.25421038269996643, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1796 + }, + { + "epoch": 1.4353035143769968, + "grad_norm": 0.09670932590961456, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1797 + }, + { + "epoch": 1.4361022364217253, + "grad_norm": 0.11264955252408981, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1798 + }, + { + "epoch": 1.4369009584664536, + "grad_norm": 0.1401909440755844, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1799 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 0.08234099298715591, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1800 + }, + { + "epoch": 1.4384984025559104, + "grad_norm": 0.05028436705470085, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1801 + }, + { + "epoch": 1.439297124600639, + "grad_norm": 0.04673704132437706, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1802 + }, + { + "epoch": 1.4400958466453675, + "grad_norm": 0.07369101047515869, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 1803 + }, + { + "epoch": 1.4408945686900958, + "grad_norm": 0.161424919962883, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1804 + }, + { + "epoch": 1.4416932907348243, + "grad_norm": 0.13576306402683258, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1805 + }, + { + "epoch": 1.4424920127795526, + "grad_norm": 0.063505619764328, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 1806 + }, + { + "epoch": 1.4432907348242812, + "grad_norm": 0.07231617718935013, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1807 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 0.1698617935180664, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1808 + }, + { + "epoch": 1.444888178913738, + "grad_norm": 0.16520395874977112, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1809 + }, + { + "epoch": 1.4456869009584665, + "grad_norm": 0.058485522866249084, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1810 + }, + { + "epoch": 1.4464856230031948, + "grad_norm": 0.0816773921251297, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1811 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 0.15307661890983582, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1812 + }, + { + "epoch": 1.4480830670926519, + "grad_norm": 0.20710408687591553, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 1813 + }, + { + "epoch": 1.4488817891373802, + "grad_norm": 0.1786869764328003, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1814 + }, + { + "epoch": 1.4496805111821085, + "grad_norm": 0.07363469898700714, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1815 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 0.10158272087574005, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1816 + }, + { + "epoch": 1.4512779552715656, + "grad_norm": 0.14304493367671967, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1817 + }, + { + "epoch": 1.4520766773162939, + "grad_norm": 0.11782495677471161, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1818 + }, + { + "epoch": 1.4528753993610224, + "grad_norm": 0.09340433776378632, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1819 + }, + { + "epoch": 1.4536741214057507, + "grad_norm": 0.08881603926420212, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1820 + }, + { + "epoch": 1.4544728434504792, + "grad_norm": 0.1377323865890503, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1821 + }, + { + "epoch": 1.4552715654952078, + "grad_norm": 0.1137915700674057, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1822 + }, + { + "epoch": 1.456070287539936, + "grad_norm": 0.08219580352306366, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1823 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 0.048282165080308914, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 1824 + }, + { + "epoch": 1.457667731629393, + "grad_norm": 0.07061316817998886, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1825 + }, + { + "epoch": 1.4584664536741214, + "grad_norm": 0.09383007138967514, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1826 + }, + { + "epoch": 1.45926517571885, + "grad_norm": 0.10688310861587524, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1827 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 0.09751323610544205, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 1828 + }, + { + "epoch": 1.4608626198083068, + "grad_norm": 0.10437846183776855, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1829 + }, + { + "epoch": 1.461661341853035, + "grad_norm": 0.13903124630451202, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1830 + }, + { + "epoch": 1.4624600638977636, + "grad_norm": 0.09480495005846024, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1831 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 0.062304843217134476, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1832 + }, + { + "epoch": 1.4640575079872205, + "grad_norm": 0.13482356071472168, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1833 + }, + { + "epoch": 1.4648562300319488, + "grad_norm": 0.2302182912826538, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1834 + }, + { + "epoch": 1.4656549520766773, + "grad_norm": 0.28565964102745056, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1835 + }, + { + "epoch": 1.4664536741214058, + "grad_norm": 0.28437626361846924, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1836 + }, + { + "epoch": 1.4672523961661341, + "grad_norm": 0.20637334883213043, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1837 + }, + { + "epoch": 1.4680511182108626, + "grad_norm": 0.08829299360513687, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1838 + }, + { + "epoch": 1.468849840255591, + "grad_norm": 0.06338132172822952, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 1839 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 0.13094602525234222, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1840 + }, + { + "epoch": 1.470447284345048, + "grad_norm": 0.15911467373371124, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1841 + }, + { + "epoch": 1.4712460063897763, + "grad_norm": 0.10913829505443573, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 1842 + }, + { + "epoch": 1.4720447284345048, + "grad_norm": 0.06934744864702225, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1843 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.07930968701839447, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1844 + }, + { + "epoch": 1.4736421725239617, + "grad_norm": 0.11225491017103195, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1845 + }, + { + "epoch": 1.4744408945686902, + "grad_norm": 0.12815739214420319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1846 + }, + { + "epoch": 1.4752396166134185, + "grad_norm": 0.0943179577589035, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1847 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 0.051353566348552704, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1848 + }, + { + "epoch": 1.4768370607028753, + "grad_norm": 0.10284367203712463, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1849 + }, + { + "epoch": 1.4776357827476039, + "grad_norm": 0.18345551192760468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1850 + }, + { + "epoch": 1.4784345047923324, + "grad_norm": 0.19532762467861176, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1851 + }, + { + "epoch": 1.4792332268370607, + "grad_norm": 0.12518467009067535, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1852 + }, + { + "epoch": 1.480031948881789, + "grad_norm": 0.05363085865974426, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1853 + }, + { + "epoch": 1.4808306709265175, + "grad_norm": 0.18222568929195404, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 1854 + }, + { + "epoch": 1.481629392971246, + "grad_norm": 0.19992542266845703, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1855 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 0.1724570095539093, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1856 + }, + { + "epoch": 1.483226837060703, + "grad_norm": 0.04096012935042381, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1857 + }, + { + "epoch": 1.4840255591054312, + "grad_norm": 0.15409474074840546, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1858 + }, + { + "epoch": 1.4848242811501597, + "grad_norm": 0.29238876700401306, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1859 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 0.35619401931762695, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1860 + }, + { + "epoch": 1.4864217252396166, + "grad_norm": 0.2790282964706421, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1861 + }, + { + "epoch": 1.487220447284345, + "grad_norm": 0.0809629037976265, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1862 + }, + { + "epoch": 1.4880191693290734, + "grad_norm": 0.1827513724565506, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1863 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 0.2284395545721054, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1864 + }, + { + "epoch": 1.4896166134185305, + "grad_norm": 0.11697912216186523, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1865 + }, + { + "epoch": 1.4904153354632588, + "grad_norm": 0.08668534457683563, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 1866 + }, + { + "epoch": 1.4912140575079873, + "grad_norm": 0.19793611764907837, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1867 + }, + { + "epoch": 1.4920127795527156, + "grad_norm": 0.18775872886180878, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1868 + }, + { + "epoch": 1.4928115015974441, + "grad_norm": 0.07068412005901337, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1869 + }, + { + "epoch": 1.4936102236421724, + "grad_norm": 0.07640416920185089, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1870 + }, + { + "epoch": 1.494408945686901, + "grad_norm": 0.1333264708518982, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1871 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 0.13000380992889404, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1872 + }, + { + "epoch": 1.4960063897763578, + "grad_norm": 0.05382491648197174, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1873 + }, + { + "epoch": 1.4968051118210863, + "grad_norm": 0.12773285806179047, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1874 + }, + { + "epoch": 1.4976038338658146, + "grad_norm": 0.2441176027059555, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1875 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 0.26628851890563965, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1876 + }, + { + "epoch": 1.4992012779552715, + "grad_norm": 0.1295953392982483, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1877 + }, + { + "epoch": 1.5, + "grad_norm": 0.10860511660575867, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1878 + }, + { + "epoch": 1.5007987220447285, + "grad_norm": 0.25177180767059326, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1879 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 0.2379150688648224, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1880 + }, + { + "epoch": 1.5023961661341851, + "grad_norm": 0.101965993642807, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1881 + }, + { + "epoch": 1.5031948881789137, + "grad_norm": 0.15633052587509155, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1882 + }, + { + "epoch": 1.5039936102236422, + "grad_norm": 0.3071416914463043, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1883 + }, + { + "epoch": 1.5047923322683707, + "grad_norm": 0.2126736044883728, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1884 + }, + { + "epoch": 1.505591054313099, + "grad_norm": 0.05252298340201378, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1885 + }, + { + "epoch": 1.5063897763578273, + "grad_norm": 0.23854316771030426, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1886 + }, + { + "epoch": 1.5071884984025559, + "grad_norm": 0.305148720741272, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1887 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 0.1371227502822876, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1888 + }, + { + "epoch": 1.508785942492013, + "grad_norm": 0.16433516144752502, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1889 + }, + { + "epoch": 1.5095846645367412, + "grad_norm": 0.24010877311229706, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1890 + }, + { + "epoch": 1.5103833865814695, + "grad_norm": 0.12839943170547485, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1891 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 0.055945366621017456, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1892 + }, + { + "epoch": 1.5119808306709266, + "grad_norm": 0.16645023226737976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1893 + }, + { + "epoch": 1.5127795527156551, + "grad_norm": 0.14626996219158173, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1894 + }, + { + "epoch": 1.5135782747603834, + "grad_norm": 0.04274629056453705, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1895 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 0.10497253388166428, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1896 + }, + { + "epoch": 1.5151757188498403, + "grad_norm": 0.159364715218544, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 1897 + }, + { + "epoch": 1.5159744408945688, + "grad_norm": 0.11409968137741089, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1898 + }, + { + "epoch": 1.516773162939297, + "grad_norm": 0.03989424183964729, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1899 + }, + { + "epoch": 1.5175718849840254, + "grad_norm": 0.12703374028205872, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1900 + }, + { + "epoch": 1.518370607028754, + "grad_norm": 0.20534875988960266, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1901 + }, + { + "epoch": 1.5191693290734825, + "grad_norm": 0.2276938110589981, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1902 + }, + { + "epoch": 1.519968051118211, + "grad_norm": 0.114278644323349, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1903 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 0.08295118063688278, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 1904 + }, + { + "epoch": 1.5215654952076676, + "grad_norm": 0.18610796332359314, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1905 + }, + { + "epoch": 1.5223642172523961, + "grad_norm": 0.1920524388551712, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1906 + }, + { + "epoch": 1.5231629392971247, + "grad_norm": 0.06447675824165344, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1907 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 0.17821159958839417, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1908 + }, + { + "epoch": 1.5247603833865815, + "grad_norm": 0.23894363641738892, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1909 + }, + { + "epoch": 1.5255591054313098, + "grad_norm": 0.14711391925811768, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1910 + }, + { + "epoch": 1.5263578274760383, + "grad_norm": 0.07863837480545044, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1911 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 0.20990678668022156, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 1912 + }, + { + "epoch": 1.5279552715654952, + "grad_norm": 0.19979886710643768, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1913 + }, + { + "epoch": 1.5287539936102237, + "grad_norm": 0.0871618464589119, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1914 + }, + { + "epoch": 1.529552715654952, + "grad_norm": 0.09294576942920685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1915 + }, + { + "epoch": 1.5303514376996805, + "grad_norm": 0.23010258376598358, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1916 + }, + { + "epoch": 1.531150159744409, + "grad_norm": 0.2919708788394928, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 1917 + }, + { + "epoch": 1.5319488817891374, + "grad_norm": 0.21767428517341614, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1918 + }, + { + "epoch": 1.5327476038338657, + "grad_norm": 0.07844182848930359, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1919 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 0.14891114830970764, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1920 + }, + { + "epoch": 1.5343450479233227, + "grad_norm": 0.17959977686405182, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1921 + }, + { + "epoch": 1.5351437699680512, + "grad_norm": 0.10217028856277466, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1922 + }, + { + "epoch": 1.5359424920127795, + "grad_norm": 0.08135818690061569, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1923 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 0.19660547375679016, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1924 + }, + { + "epoch": 1.5375399361022364, + "grad_norm": 0.2106354534626007, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1925 + }, + { + "epoch": 1.538338658146965, + "grad_norm": 0.11042182147502899, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1926 + }, + { + "epoch": 1.5391373801916934, + "grad_norm": 0.08777181059122086, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1927 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 0.18283812701702118, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1928 + }, + { + "epoch": 1.54073482428115, + "grad_norm": 0.11731691658496857, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1929 + }, + { + "epoch": 1.5415335463258786, + "grad_norm": 0.04163304716348648, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1930 + }, + { + "epoch": 1.542332268370607, + "grad_norm": 0.12119868397712708, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1931 + }, + { + "epoch": 1.5431309904153354, + "grad_norm": 0.18475785851478577, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1932 + }, + { + "epoch": 1.543929712460064, + "grad_norm": 0.16582897305488586, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1933 + }, + { + "epoch": 1.5447284345047922, + "grad_norm": 0.086383156478405, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1934 + }, + { + "epoch": 1.5455271565495208, + "grad_norm": 0.047143738716840744, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1935 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 0.0830119326710701, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1936 + }, + { + "epoch": 1.5471246006389776, + "grad_norm": 0.14226214587688446, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1937 + }, + { + "epoch": 1.547923322683706, + "grad_norm": 0.1719929724931717, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1938 + }, + { + "epoch": 1.5487220447284344, + "grad_norm": 0.18388192355632782, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1939 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 0.16870245337486267, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1940 + }, + { + "epoch": 1.5503194888178915, + "grad_norm": 0.1100412905216217, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1941 + }, + { + "epoch": 1.5511182108626198, + "grad_norm": 0.05124165490269661, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1942 + }, + { + "epoch": 1.5519169329073481, + "grad_norm": 0.08937443792819977, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1943 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 0.13589949905872345, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1944 + }, + { + "epoch": 1.5535143769968052, + "grad_norm": 0.12346407026052475, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1945 + }, + { + "epoch": 1.5543130990415337, + "grad_norm": 0.11836438626050949, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1946 + }, + { + "epoch": 1.555111821086262, + "grad_norm": 0.07569031417369843, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1947 + }, + { + "epoch": 1.5559105431309903, + "grad_norm": 0.039178211241960526, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1948 + }, + { + "epoch": 1.5567092651757188, + "grad_norm": 0.0431843139231205, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1949 + }, + { + "epoch": 1.5575079872204474, + "grad_norm": 0.06331207603216171, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1950 + }, + { + "epoch": 1.5583067092651757, + "grad_norm": 0.0670275092124939, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1951 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 0.04372883588075638, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1952 + }, + { + "epoch": 1.5599041533546325, + "grad_norm": 0.15768256783485413, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1953 + }, + { + "epoch": 1.560702875399361, + "grad_norm": 0.30828192830085754, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1954 + }, + { + "epoch": 1.5615015974440896, + "grad_norm": 0.3741140365600586, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1955 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 0.25689223408699036, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1956 + }, + { + "epoch": 1.5630990415335462, + "grad_norm": 0.0691552683711052, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1957 + }, + { + "epoch": 1.5638977635782747, + "grad_norm": 0.2742094099521637, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1958 + }, + { + "epoch": 1.5646964856230032, + "grad_norm": 0.2760325074195862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1959 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 0.09094057232141495, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1960 + }, + { + "epoch": 1.56629392971246, + "grad_norm": 0.11926092952489853, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1961 + }, + { + "epoch": 1.5670926517571884, + "grad_norm": 0.18398839235305786, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1962 + }, + { + "epoch": 1.567891373801917, + "grad_norm": 0.17090962827205658, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1963 + }, + { + "epoch": 1.5686900958466454, + "grad_norm": 0.07806222885847092, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1964 + }, + { + "epoch": 1.569488817891374, + "grad_norm": 0.17260140180587769, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1965 + }, + { + "epoch": 1.5702875399361023, + "grad_norm": 0.2848401665687561, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1966 + }, + { + "epoch": 1.5710862619808306, + "grad_norm": 0.19075879454612732, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1967 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 0.044234778732061386, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 1968 + }, + { + "epoch": 1.5726837060702876, + "grad_norm": 0.16188788414001465, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1969 + }, + { + "epoch": 1.573482428115016, + "grad_norm": 0.19148766994476318, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1970 + }, + { + "epoch": 1.5742811501597445, + "grad_norm": 0.11576604843139648, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1971 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 0.049716517329216, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1972 + }, + { + "epoch": 1.5758785942492013, + "grad_norm": 0.12528614699840546, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1973 + }, + { + "epoch": 1.5766773162939298, + "grad_norm": 0.1574268341064453, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1974 + }, + { + "epoch": 1.5774760383386581, + "grad_norm": 0.06606525182723999, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1975 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 0.16142094135284424, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1976 + }, + { + "epoch": 1.579073482428115, + "grad_norm": 0.29769718647003174, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1977 + }, + { + "epoch": 1.5798722044728435, + "grad_norm": 0.20111548900604248, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1978 + }, + { + "epoch": 1.580670926517572, + "grad_norm": 0.06375493854284286, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1979 + }, + { + "epoch": 1.5814696485623003, + "grad_norm": 0.2208068072795868, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1980 + }, + { + "epoch": 1.5822683706070286, + "grad_norm": 0.2920839488506317, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1981 + }, + { + "epoch": 1.5830670926517572, + "grad_norm": 0.2115958034992218, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1982 + }, + { + "epoch": 1.5838658146964857, + "grad_norm": 0.048249468207359314, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1983 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 0.15551301836967468, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1984 + }, + { + "epoch": 1.5854632587859425, + "grad_norm": 0.2190883755683899, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 1985 + }, + { + "epoch": 1.5862619808306708, + "grad_norm": 0.15155111253261566, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1986 + }, + { + "epoch": 1.5870607028753994, + "grad_norm": 0.056616391986608505, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1987 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 0.1638905555009842, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1988 + }, + { + "epoch": 1.5886581469648562, + "grad_norm": 0.11643283069133759, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1989 + }, + { + "epoch": 1.5894568690095847, + "grad_norm": 0.06423045694828033, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1990 + }, + { + "epoch": 1.590255591054313, + "grad_norm": 0.11044095456600189, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1991 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 0.11911707371473312, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1992 + }, + { + "epoch": 1.59185303514377, + "grad_norm": 0.045604925602674484, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1993 + }, + { + "epoch": 1.5926517571884984, + "grad_norm": 0.10280558466911316, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1994 + }, + { + "epoch": 1.5934504792332267, + "grad_norm": 0.13807371258735657, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1995 + }, + { + "epoch": 1.5942492012779552, + "grad_norm": 0.06163270026445389, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1996 + }, + { + "epoch": 1.5950479233226837, + "grad_norm": 0.12899963557720184, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1997 + }, + { + "epoch": 1.5958466453674123, + "grad_norm": 0.24358411133289337, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 1998 + }, + { + "epoch": 1.5966453674121406, + "grad_norm": 0.23341934382915497, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1999 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 0.11766334623098373, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2000 + }, + { + "epoch": 1.5982428115015974, + "grad_norm": 0.07918071746826172, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2001 + }, + { + "epoch": 1.599041533546326, + "grad_norm": 0.1473437398672104, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2002 + }, + { + "epoch": 1.5998402555910545, + "grad_norm": 0.08945708721876144, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2003 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.06553255021572113, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2004 + }, + { + "epoch": 1.601437699680511, + "grad_norm": 0.12708786129951477, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2005 + }, + { + "epoch": 1.6022364217252396, + "grad_norm": 0.16935905814170837, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2006 + }, + { + "epoch": 1.6030351437699681, + "grad_norm": 0.10428016632795334, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2007 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 0.06016766279935837, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 2008 + }, + { + "epoch": 1.604632587859425, + "grad_norm": 0.1563751995563507, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2009 + }, + { + "epoch": 1.6054313099041533, + "grad_norm": 0.1919829398393631, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2010 + }, + { + "epoch": 1.6062300319488818, + "grad_norm": 0.14739179611206055, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2011 + }, + { + "epoch": 1.6070287539936103, + "grad_norm": 0.08086550235748291, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2012 + }, + { + "epoch": 1.6078274760383386, + "grad_norm": 0.06594815105199814, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2013 + }, + { + "epoch": 1.608626198083067, + "grad_norm": 0.10502789169549942, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2014 + }, + { + "epoch": 1.6094249201277955, + "grad_norm": 0.1312190145254135, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2015 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 0.062411367893218994, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2016 + }, + { + "epoch": 1.6110223642172525, + "grad_norm": 0.04986036196351051, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2017 + }, + { + "epoch": 1.6118210862619808, + "grad_norm": 0.08428573608398438, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2018 + }, + { + "epoch": 1.6126198083067091, + "grad_norm": 0.11552372574806213, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2019 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 0.07657046616077423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2020 + }, + { + "epoch": 1.6142172523961662, + "grad_norm": 0.05540962517261505, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 2021 + }, + { + "epoch": 1.6150159744408947, + "grad_norm": 0.048573557287454605, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2022 + }, + { + "epoch": 1.615814696485623, + "grad_norm": 0.08630840480327606, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2023 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 0.06090754270553589, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2024 + }, + { + "epoch": 1.6174121405750799, + "grad_norm": 0.05828041955828667, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2025 + }, + { + "epoch": 1.6182108626198084, + "grad_norm": 0.12483426928520203, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2026 + }, + { + "epoch": 1.6190095846645367, + "grad_norm": 0.13772840797901154, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2027 + }, + { + "epoch": 1.619808306709265, + "grad_norm": 0.08477568626403809, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2028 + }, + { + "epoch": 1.6206070287539935, + "grad_norm": 0.037577688694000244, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2029 + }, + { + "epoch": 1.621405750798722, + "grad_norm": 0.07961893081665039, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2030 + }, + { + "epoch": 1.6222044728434506, + "grad_norm": 0.06744182854890823, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2031 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 0.06228869408369064, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2032 + }, + { + "epoch": 1.6238019169329072, + "grad_norm": 0.1972920298576355, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2033 + }, + { + "epoch": 1.6246006389776357, + "grad_norm": 0.2701529562473297, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2034 + }, + { + "epoch": 1.6253993610223643, + "grad_norm": 0.20371970534324646, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2035 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 0.08887646347284317, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2036 + }, + { + "epoch": 1.626996805111821, + "grad_norm": 0.06480003893375397, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2037 + }, + { + "epoch": 1.6277955271565494, + "grad_norm": 0.089780792593956, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2038 + }, + { + "epoch": 1.628594249201278, + "grad_norm": 0.04014933854341507, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2039 + }, + { + "epoch": 1.6293929712460065, + "grad_norm": 0.0993470847606659, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 2040 + }, + { + "epoch": 1.630191693290735, + "grad_norm": 0.1957429200410843, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2041 + }, + { + "epoch": 1.6309904153354633, + "grad_norm": 0.2273249477148056, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2042 + }, + { + "epoch": 1.6317891373801916, + "grad_norm": 0.1936638057231903, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2043 + }, + { + "epoch": 1.6325878594249201, + "grad_norm": 0.10150687396526337, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2044 + }, + { + "epoch": 1.6333865814696487, + "grad_norm": 0.051224563270807266, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2045 + }, + { + "epoch": 1.634185303514377, + "grad_norm": 0.13044138252735138, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2046 + }, + { + "epoch": 1.6349840255591053, + "grad_norm": 0.16140064597129822, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2047 + }, + { + "epoch": 1.6357827476038338, + "grad_norm": 0.13187173008918762, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2048 + }, + { + "epoch": 1.6365814696485623, + "grad_norm": 0.03873397782444954, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2049 + }, + { + "epoch": 1.6373801916932909, + "grad_norm": 0.0575883649289608, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2050 + }, + { + "epoch": 1.6381789137380192, + "grad_norm": 0.039476748555898666, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 2051 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 0.06802869588136673, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2052 + }, + { + "epoch": 1.639776357827476, + "grad_norm": 0.059946198016405106, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2053 + }, + { + "epoch": 1.6405750798722045, + "grad_norm": 0.05185665935277939, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2054 + }, + { + "epoch": 1.641373801916933, + "grad_norm": 0.08230192214250565, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2055 + }, + { + "epoch": 1.6421725239616614, + "grad_norm": 0.10175196081399918, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2056 + }, + { + "epoch": 1.6429712460063897, + "grad_norm": 0.07616171985864639, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2057 + }, + { + "epoch": 1.6437699680511182, + "grad_norm": 0.4597811698913574, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2058 + }, + { + "epoch": 1.6445686900958467, + "grad_norm": 0.12450811266899109, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2059 + }, + { + "epoch": 1.645367412140575, + "grad_norm": 0.10847678035497665, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2060 + }, + { + "epoch": 1.6461661341853036, + "grad_norm": 0.05778864026069641, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2061 + }, + { + "epoch": 1.6469648562300319, + "grad_norm": 0.04321129992604256, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2062 + }, + { + "epoch": 1.6477635782747604, + "grad_norm": 0.05467045307159424, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2063 + }, + { + "epoch": 1.648562300319489, + "grad_norm": 0.044298864901065826, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2064 + }, + { + "epoch": 1.6493610223642172, + "grad_norm": 0.03863062337040901, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2065 + }, + { + "epoch": 1.6501597444089455, + "grad_norm": 0.04040979593992233, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2066 + }, + { + "epoch": 1.650958466453674, + "grad_norm": 0.03647322207689285, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2067 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 0.049459293484687805, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2068 + }, + { + "epoch": 1.6525559105431311, + "grad_norm": 0.052851296961307526, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2069 + }, + { + "epoch": 1.6533546325878594, + "grad_norm": 0.10360822081565857, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2070 + }, + { + "epoch": 1.6541533546325877, + "grad_norm": 0.18817105889320374, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2071 + }, + { + "epoch": 1.6549520766773163, + "grad_norm": 0.1711605340242386, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2072 + }, + { + "epoch": 1.6557507987220448, + "grad_norm": 0.08807278424501419, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2073 + }, + { + "epoch": 1.6565495207667733, + "grad_norm": 0.0631125420331955, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2074 + }, + { + "epoch": 1.6573482428115016, + "grad_norm": 0.17277394235134125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2075 + }, + { + "epoch": 1.65814696485623, + "grad_norm": 0.2353454977273941, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2076 + }, + { + "epoch": 1.6589456869009584, + "grad_norm": 0.18835891783237457, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2077 + }, + { + "epoch": 1.659744408945687, + "grad_norm": 0.08717352151870728, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2078 + }, + { + "epoch": 1.6605431309904153, + "grad_norm": 0.05640486627817154, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2079 + }, + { + "epoch": 1.6613418530351438, + "grad_norm": 0.11206189543008804, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2080 + }, + { + "epoch": 1.6621405750798721, + "grad_norm": 0.10098055750131607, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2081 + }, + { + "epoch": 1.6629392971246006, + "grad_norm": 0.04627184569835663, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2082 + }, + { + "epoch": 1.6637380191693292, + "grad_norm": 0.13048212230205536, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2083 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 0.22329512238502502, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2084 + }, + { + "epoch": 1.6653354632587858, + "grad_norm": 0.23544666171073914, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2085 + }, + { + "epoch": 1.6661341853035143, + "grad_norm": 0.1329459846019745, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2086 + }, + { + "epoch": 1.6669329073482428, + "grad_norm": 0.07398947328329086, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2087 + }, + { + "epoch": 1.6677316293929714, + "grad_norm": 0.1926809549331665, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2088 + }, + { + "epoch": 1.6685303514376997, + "grad_norm": 0.19097647070884705, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2089 + }, + { + "epoch": 1.669329073482428, + "grad_norm": 0.10474745184183121, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2090 + }, + { + "epoch": 1.6701277955271565, + "grad_norm": 0.04437112435698509, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2091 + }, + { + "epoch": 1.670926517571885, + "grad_norm": 0.13698135316371918, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2092 + }, + { + "epoch": 1.6717252396166136, + "grad_norm": 0.14437462389469147, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2093 + }, + { + "epoch": 1.6725239616613419, + "grad_norm": 0.0938732922077179, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2094 + }, + { + "epoch": 1.6733226837060702, + "grad_norm": 0.060729511082172394, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2095 + }, + { + "epoch": 1.6741214057507987, + "grad_norm": 0.05354619398713112, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2096 + }, + { + "epoch": 1.6749201277955272, + "grad_norm": 0.056909799575805664, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2097 + }, + { + "epoch": 1.6757188498402555, + "grad_norm": 0.09815286099910736, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2098 + }, + { + "epoch": 1.676517571884984, + "grad_norm": 0.1432102620601654, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2099 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 0.14039601385593414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2100 + }, + { + "epoch": 1.678115015974441, + "grad_norm": 0.06634008139371872, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2101 + }, + { + "epoch": 1.6789137380191694, + "grad_norm": 0.1347021609544754, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2102 + }, + { + "epoch": 1.6797124600638977, + "grad_norm": 0.24721868336200714, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2103 + }, + { + "epoch": 1.680511182108626, + "grad_norm": 0.23194770514965057, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2104 + }, + { + "epoch": 1.6813099041533546, + "grad_norm": 0.12276436388492584, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2105 + }, + { + "epoch": 1.682108626198083, + "grad_norm": 0.06224825233221054, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2106 + }, + { + "epoch": 1.6829073482428116, + "grad_norm": 0.20683766901493073, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2107 + }, + { + "epoch": 1.68370607028754, + "grad_norm": 0.26914462447166443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2108 + }, + { + "epoch": 1.6845047923322682, + "grad_norm": 0.20070654153823853, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2109 + }, + { + "epoch": 1.6853035143769968, + "grad_norm": 0.08465532958507538, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2110 + }, + { + "epoch": 1.6861022364217253, + "grad_norm": 0.10843367129564285, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2111 + }, + { + "epoch": 1.6869009584664538, + "grad_norm": 0.20252646505832672, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2112 + }, + { + "epoch": 1.6876996805111821, + "grad_norm": 0.11803672462701797, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2113 + }, + { + "epoch": 1.6884984025559104, + "grad_norm": 0.08800901472568512, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2114 + }, + { + "epoch": 1.689297124600639, + "grad_norm": 0.23917800188064575, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2115 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 0.21528035402297974, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2116 + }, + { + "epoch": 1.6908945686900958, + "grad_norm": 0.05292942747473717, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2117 + }, + { + "epoch": 1.6916932907348243, + "grad_norm": 0.12942583858966827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2118 + }, + { + "epoch": 1.6924920127795526, + "grad_norm": 0.19304881989955902, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2119 + }, + { + "epoch": 1.6932907348242812, + "grad_norm": 0.10951094329357147, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 2120 + }, + { + "epoch": 1.6940894568690097, + "grad_norm": 0.07684643566608429, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2121 + }, + { + "epoch": 1.694888178913738, + "grad_norm": 0.14990608394145966, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2122 + }, + { + "epoch": 1.6956869009584663, + "grad_norm": 0.1104716882109642, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2123 + }, + { + "epoch": 1.6964856230031948, + "grad_norm": 0.06538088619709015, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2124 + }, + { + "epoch": 1.6972843450479234, + "grad_norm": 0.05474448576569557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2125 + }, + { + "epoch": 1.6980830670926519, + "grad_norm": 0.0803864449262619, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2126 + }, + { + "epoch": 1.6988817891373802, + "grad_norm": 0.04384651407599449, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2127 + }, + { + "epoch": 1.6996805111821085, + "grad_norm": 0.07006746530532837, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 2128 + }, + { + "epoch": 1.700479233226837, + "grad_norm": 0.08840122073888779, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2129 + }, + { + "epoch": 1.7012779552715656, + "grad_norm": 0.06421404331922531, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2130 + }, + { + "epoch": 1.702076677316294, + "grad_norm": 0.03711751103401184, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2131 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 0.06725160032510757, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2132 + }, + { + "epoch": 1.7036741214057507, + "grad_norm": 0.0517839640378952, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2133 + }, + { + "epoch": 1.7044728434504792, + "grad_norm": 0.046399205923080444, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2134 + }, + { + "epoch": 1.7052715654952078, + "grad_norm": 0.05188435688614845, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2135 + }, + { + "epoch": 1.706070287539936, + "grad_norm": 0.08578629791736603, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2136 + }, + { + "epoch": 1.7068690095846646, + "grad_norm": 0.07895999401807785, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2137 + }, + { + "epoch": 1.707667731629393, + "grad_norm": 0.060662928968667984, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2138 + }, + { + "epoch": 1.7084664536741214, + "grad_norm": 0.08372191339731216, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2139 + }, + { + "epoch": 1.70926517571885, + "grad_norm": 0.1217966303229332, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2140 + }, + { + "epoch": 1.7100638977635783, + "grad_norm": 0.14054186642169952, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2141 + }, + { + "epoch": 1.7108626198083066, + "grad_norm": 0.11693520098924637, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2142 + }, + { + "epoch": 1.711661341853035, + "grad_norm": 0.04271163418889046, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2143 + }, + { + "epoch": 1.7124600638977636, + "grad_norm": 0.11898874491453171, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2144 + }, + { + "epoch": 1.7132587859424921, + "grad_norm": 0.2637499272823334, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2145 + }, + { + "epoch": 1.7140575079872205, + "grad_norm": 0.29218390583992004, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2146 + }, + { + "epoch": 1.7148562300319488, + "grad_norm": 0.1899375170469284, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2147 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 0.04336607828736305, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2148 + }, + { + "epoch": 1.7164536741214058, + "grad_norm": 0.14123578369617462, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2149 + }, + { + "epoch": 1.7172523961661343, + "grad_norm": 0.19930055737495422, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2150 + }, + { + "epoch": 1.7180511182108626, + "grad_norm": 0.1796298772096634, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2151 + }, + { + "epoch": 1.718849840255591, + "grad_norm": 0.07607068121433258, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2152 + }, + { + "epoch": 1.7196485623003195, + "grad_norm": 0.12980210781097412, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2153 + }, + { + "epoch": 1.720447284345048, + "grad_norm": 0.2507205009460449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2154 + }, + { + "epoch": 1.7212460063897763, + "grad_norm": 0.2388920783996582, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2155 + }, + { + "epoch": 1.7220447284345048, + "grad_norm": 0.13363847136497498, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 1.7228434504792332, + "grad_norm": 0.048030026257038116, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 2157 + }, + { + "epoch": 1.7236421725239617, + "grad_norm": 0.14619708061218262, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2158 + }, + { + "epoch": 1.7244408945686902, + "grad_norm": 0.22031216323375702, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2159 + }, + { + "epoch": 1.7252396166134185, + "grad_norm": 0.18440701067447662, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2160 + }, + { + "epoch": 1.7260383386581468, + "grad_norm": 0.08183866739273071, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2161 + }, + { + "epoch": 1.7268370607028753, + "grad_norm": 0.05314984545111656, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2162 + }, + { + "epoch": 1.7276357827476039, + "grad_norm": 0.1438753753900528, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2163 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 0.0881122425198555, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 2164 + }, + { + "epoch": 1.7292332268370607, + "grad_norm": 0.1165589690208435, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2165 + }, + { + "epoch": 1.730031948881789, + "grad_norm": 0.14884884655475616, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2166 + }, + { + "epoch": 1.7308306709265175, + "grad_norm": 0.10219287127256393, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2167 + }, + { + "epoch": 1.731629392971246, + "grad_norm": 0.059794824570417404, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2168 + }, + { + "epoch": 1.7324281150159746, + "grad_norm": 0.0538945347070694, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2169 + }, + { + "epoch": 1.733226837060703, + "grad_norm": 0.1016303226351738, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2170 + }, + { + "epoch": 1.7340255591054312, + "grad_norm": 0.058912694454193115, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2171 + }, + { + "epoch": 1.7348242811501597, + "grad_norm": 0.060018621385097504, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2172 + }, + { + "epoch": 1.7356230031948883, + "grad_norm": 0.05386706069111824, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2173 + }, + { + "epoch": 1.7364217252396166, + "grad_norm": 0.06266453117132187, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2174 + }, + { + "epoch": 1.7372204472843449, + "grad_norm": 0.1035243570804596, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 2175 + }, + { + "epoch": 1.7380191693290734, + "grad_norm": 0.17216888070106506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2176 + }, + { + "epoch": 1.738817891373802, + "grad_norm": 0.23428532481193542, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2177 + }, + { + "epoch": 1.7396166134185305, + "grad_norm": 0.21038073301315308, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2178 + }, + { + "epoch": 1.7404153354632588, + "grad_norm": 0.1487000286579132, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2179 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 0.03916196525096893, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2180 + }, + { + "epoch": 1.7420127795527156, + "grad_norm": 0.13702991604804993, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2181 + }, + { + "epoch": 1.7428115015974441, + "grad_norm": 0.21363528072834015, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2182 + }, + { + "epoch": 1.7436102236421727, + "grad_norm": 0.134271502494812, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2183 + }, + { + "epoch": 1.744408945686901, + "grad_norm": 0.062452565878629684, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2184 + }, + { + "epoch": 1.7452076677316293, + "grad_norm": 0.1745995730161667, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2185 + }, + { + "epoch": 1.7460063897763578, + "grad_norm": 0.19709894061088562, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2186 + }, + { + "epoch": 1.7468051118210863, + "grad_norm": 0.1201571598649025, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2187 + }, + { + "epoch": 1.7476038338658149, + "grad_norm": 0.03690087050199509, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2188 + }, + { + "epoch": 1.7484025559105432, + "grad_norm": 0.1387440711259842, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2189 + }, + { + "epoch": 1.7492012779552715, + "grad_norm": 0.2084781676530838, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2190 + }, + { + "epoch": 1.75, + "grad_norm": 0.17941167950630188, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2191 + }, + { + "epoch": 1.7507987220447285, + "grad_norm": 0.09751889854669571, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2192 + }, + { + "epoch": 1.7515974440894568, + "grad_norm": 0.04116421565413475, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2193 + }, + { + "epoch": 1.7523961661341851, + "grad_norm": 0.14683429896831512, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2194 + }, + { + "epoch": 1.7531948881789137, + "grad_norm": 0.19602352380752563, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2195 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 0.18503598868846893, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2196 + }, + { + "epoch": 1.7547923322683707, + "grad_norm": 0.09473808109760284, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2197 + }, + { + "epoch": 1.755591054313099, + "grad_norm": 0.05645129457116127, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2198 + }, + { + "epoch": 1.7563897763578273, + "grad_norm": 0.09260818362236023, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2199 + }, + { + "epoch": 1.7571884984025559, + "grad_norm": 0.045891985297203064, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2200 + }, + { + "epoch": 1.7579872204472844, + "grad_norm": 0.125623419880867, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2201 + }, + { + "epoch": 1.758785942492013, + "grad_norm": 0.18919512629508972, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2202 + }, + { + "epoch": 1.7595846645367412, + "grad_norm": 0.17549264430999756, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2203 + }, + { + "epoch": 1.7603833865814695, + "grad_norm": 0.047342319041490555, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 2204 + }, + { + "epoch": 1.761182108626198, + "grad_norm": 0.177268847823143, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2205 + }, + { + "epoch": 1.7619808306709266, + "grad_norm": 0.28258222341537476, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2206 + }, + { + "epoch": 1.7627795527156551, + "grad_norm": 0.25111353397369385, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2207 + }, + { + "epoch": 1.7635782747603834, + "grad_norm": 0.11864849925041199, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2208 + }, + { + "epoch": 1.7643769968051117, + "grad_norm": 0.06387785822153091, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2209 + }, + { + "epoch": 1.7651757188498403, + "grad_norm": 0.1264238804578781, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2210 + }, + { + "epoch": 1.7659744408945688, + "grad_norm": 0.12080882489681244, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2211 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.05618004873394966, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2212 + }, + { + "epoch": 1.7675718849840254, + "grad_norm": 0.06543037295341492, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2213 + }, + { + "epoch": 1.768370607028754, + "grad_norm": 0.08525256812572479, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2214 + }, + { + "epoch": 1.7691693290734825, + "grad_norm": 0.08571972697973251, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2215 + }, + { + "epoch": 1.769968051118211, + "grad_norm": 0.04897582530975342, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2216 + }, + { + "epoch": 1.7707667731629393, + "grad_norm": 0.07296427339315414, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2217 + }, + { + "epoch": 1.7715654952076676, + "grad_norm": 0.041904110461473465, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2218 + }, + { + "epoch": 1.7723642172523961, + "grad_norm": 0.053191233426332474, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2219 + }, + { + "epoch": 1.7731629392971247, + "grad_norm": 0.056369587779045105, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2220 + }, + { + "epoch": 1.7739616613418532, + "grad_norm": 0.06455157697200775, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2221 + }, + { + "epoch": 1.7747603833865815, + "grad_norm": 0.06467561423778534, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2222 + }, + { + "epoch": 1.7755591054313098, + "grad_norm": 0.07162238657474518, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2223 + }, + { + "epoch": 1.7763578274760383, + "grad_norm": 0.045193906873464584, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2224 + }, + { + "epoch": 1.7771565495207668, + "grad_norm": 0.07172992080450058, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2225 + }, + { + "epoch": 1.7779552715654952, + "grad_norm": 0.07163143157958984, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2226 + }, + { + "epoch": 1.7787539936102237, + "grad_norm": 0.11480346322059631, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2227 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 0.21525998413562775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 2228 + }, + { + "epoch": 1.7803514376996805, + "grad_norm": 0.20769886672496796, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2229 + }, + { + "epoch": 1.781150159744409, + "grad_norm": 0.13149204850196838, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2230 + }, + { + "epoch": 1.7819488817891374, + "grad_norm": 0.06223989278078079, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2231 + }, + { + "epoch": 1.7827476038338657, + "grad_norm": 0.11386150866746902, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2232 + }, + { + "epoch": 1.7835463258785942, + "grad_norm": 0.1448865532875061, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2233 + }, + { + "epoch": 1.7843450479233227, + "grad_norm": 0.11244893074035645, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2234 + }, + { + "epoch": 1.7851437699680512, + "grad_norm": 0.06307587027549744, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2235 + }, + { + "epoch": 1.7859424920127795, + "grad_norm": 0.1529018133878708, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2236 + }, + { + "epoch": 1.7867412140575079, + "grad_norm": 0.212649405002594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2237 + }, + { + "epoch": 1.7875399361022364, + "grad_norm": 0.18361856043338776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2238 + }, + { + "epoch": 1.788338658146965, + "grad_norm": 0.06960433721542358, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2239 + }, + { + "epoch": 1.7891373801916934, + "grad_norm": 0.13445821404457092, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2240 + }, + { + "epoch": 1.7899361022364217, + "grad_norm": 0.24758578836917877, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2241 + }, + { + "epoch": 1.79073482428115, + "grad_norm": 0.27208608388900757, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2242 + }, + { + "epoch": 1.7915335463258786, + "grad_norm": 0.1256505697965622, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2243 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 0.12209334224462509, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2244 + }, + { + "epoch": 1.7931309904153354, + "grad_norm": 0.2690032720565796, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2245 + }, + { + "epoch": 1.793929712460064, + "grad_norm": 0.27393221855163574, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2246 + }, + { + "epoch": 1.7947284345047922, + "grad_norm": 0.12508991360664368, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 2247 + }, + { + "epoch": 1.7955271565495208, + "grad_norm": 0.10001108795404434, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2248 + }, + { + "epoch": 1.7963258785942493, + "grad_norm": 0.2588697373867035, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2249 + }, + { + "epoch": 1.7971246006389776, + "grad_norm": 0.24723860621452332, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2250 + }, + { + "epoch": 1.797923322683706, + "grad_norm": 0.09018664062023163, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2251 + }, + { + "epoch": 1.7987220447284344, + "grad_norm": 0.09745316952466965, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 2252 + }, + { + "epoch": 1.799520766773163, + "grad_norm": 0.20877481997013092, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 2253 + }, + { + "epoch": 1.8003194888178915, + "grad_norm": 0.24291004240512848, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2254 + }, + { + "epoch": 1.8011182108626198, + "grad_norm": 0.1967754364013672, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2255 + }, + { + "epoch": 1.8019169329073481, + "grad_norm": 0.088215172290802, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2256 + }, + { + "epoch": 1.8027156549520766, + "grad_norm": 0.07018816471099854, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2257 + }, + { + "epoch": 1.8035143769968052, + "grad_norm": 0.17161858081817627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2258 + }, + { + "epoch": 1.8043130990415337, + "grad_norm": 0.22007174789905548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2259 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 0.16093726456165314, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2260 + }, + { + "epoch": 1.8059105431309903, + "grad_norm": 0.06763539463281631, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2261 + }, + { + "epoch": 1.8067092651757188, + "grad_norm": 0.1066257432103157, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2262 + }, + { + "epoch": 1.8075079872204474, + "grad_norm": 0.17658250033855438, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2263 + }, + { + "epoch": 1.8083067092651757, + "grad_norm": 0.21157506108283997, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2264 + }, + { + "epoch": 1.8091054313099042, + "grad_norm": 0.16717523336410522, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2265 + }, + { + "epoch": 1.8099041533546325, + "grad_norm": 0.08356527984142303, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2266 + }, + { + "epoch": 1.810702875399361, + "grad_norm": 0.11939100921154022, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2267 + }, + { + "epoch": 1.8115015974440896, + "grad_norm": 0.2322039157152176, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2268 + }, + { + "epoch": 1.8123003194888179, + "grad_norm": 0.2277170568704605, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2269 + }, + { + "epoch": 1.8130990415335462, + "grad_norm": 0.06634530425071716, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2270 + }, + { + "epoch": 1.8138977635782747, + "grad_norm": 0.20808424055576324, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2271 + }, + { + "epoch": 1.8146964856230032, + "grad_norm": 0.3761717975139618, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2272 + }, + { + "epoch": 1.8154952076677318, + "grad_norm": 0.3587193191051483, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2273 + }, + { + "epoch": 1.81629392971246, + "grad_norm": 0.12116564810276031, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2274 + }, + { + "epoch": 1.8170926517571884, + "grad_norm": 0.20137764513492584, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2275 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 0.30456987023353577, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2276 + }, + { + "epoch": 1.8186900958466454, + "grad_norm": 0.15625369548797607, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2277 + }, + { + "epoch": 1.819488817891374, + "grad_norm": 0.12682494521141052, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2278 + }, + { + "epoch": 1.8202875399361023, + "grad_norm": 0.26252153515815735, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2279 + }, + { + "epoch": 1.8210862619808306, + "grad_norm": 0.17610949277877808, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2280 + }, + { + "epoch": 1.821884984025559, + "grad_norm": 0.056205663830041885, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2281 + }, + { + "epoch": 1.8226837060702876, + "grad_norm": 0.1519095003604889, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2282 + }, + { + "epoch": 1.823482428115016, + "grad_norm": 0.1591203212738037, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2283 + }, + { + "epoch": 1.8242811501597445, + "grad_norm": 0.11261039227247238, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2284 + }, + { + "epoch": 1.8250798722044728, + "grad_norm": 0.06855058670043945, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2285 + }, + { + "epoch": 1.8258785942492013, + "grad_norm": 0.04728224128484726, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2286 + }, + { + "epoch": 1.8266773162939298, + "grad_norm": 0.0677042305469513, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2287 + }, + { + "epoch": 1.8274760383386581, + "grad_norm": 0.0836048573255539, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2288 + }, + { + "epoch": 1.8282747603833864, + "grad_norm": 0.0657985508441925, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2289 + }, + { + "epoch": 1.829073482428115, + "grad_norm": 0.05567999184131622, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2290 + }, + { + "epoch": 1.8298722044728435, + "grad_norm": 0.13710817694664001, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2291 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 0.14417411386966705, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2292 + }, + { + "epoch": 1.8314696485623003, + "grad_norm": 0.12273317575454712, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2293 + }, + { + "epoch": 1.8322683706070286, + "grad_norm": 0.12350328266620636, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2294 + }, + { + "epoch": 1.8330670926517572, + "grad_norm": 0.12832887470722198, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2295 + }, + { + "epoch": 1.8338658146964857, + "grad_norm": 0.17759868502616882, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2296 + }, + { + "epoch": 1.8346645367412142, + "grad_norm": 0.18485887348651886, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2297 + }, + { + "epoch": 1.8354632587859425, + "grad_norm": 0.11906488239765167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2298 + }, + { + "epoch": 1.8362619808306708, + "grad_norm": 0.04088319092988968, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2299 + }, + { + "epoch": 1.8370607028753994, + "grad_norm": 0.18988807499408722, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2300 + }, + { + "epoch": 1.8378594249201279, + "grad_norm": 0.2758033275604248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2301 + }, + { + "epoch": 1.8386581469648562, + "grad_norm": 0.26860401034355164, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2302 + }, + { + "epoch": 1.8394568690095847, + "grad_norm": 0.1770019680261612, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2303 + }, + { + "epoch": 1.840255591054313, + "grad_norm": 0.03740993142127991, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2304 + }, + { + "epoch": 1.8410543130990416, + "grad_norm": 0.13697518408298492, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2305 + }, + { + "epoch": 1.84185303514377, + "grad_norm": 0.15273790061473846, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2306 + }, + { + "epoch": 1.8426517571884984, + "grad_norm": 0.08181154727935791, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2307 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.05599624291062355, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2308 + }, + { + "epoch": 1.8442492012779552, + "grad_norm": 0.17429251968860626, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2309 + }, + { + "epoch": 1.8450479233226837, + "grad_norm": 0.20159491896629333, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2310 + }, + { + "epoch": 1.8458466453674123, + "grad_norm": 0.10825419425964355, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2311 + }, + { + "epoch": 1.8466453674121406, + "grad_norm": 0.0784185528755188, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2312 + }, + { + "epoch": 1.8474440894568689, + "grad_norm": 0.15851987898349762, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2313 + }, + { + "epoch": 1.8482428115015974, + "grad_norm": 0.11244971305131912, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2314 + }, + { + "epoch": 1.849041533546326, + "grad_norm": 0.04119047150015831, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2315 + }, + { + "epoch": 1.8498402555910545, + "grad_norm": 0.12872102856636047, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2316 + }, + { + "epoch": 1.8506389776357828, + "grad_norm": 0.1542259305715561, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2317 + }, + { + "epoch": 1.851437699680511, + "grad_norm": 0.09662868827581406, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2318 + }, + { + "epoch": 1.8522364217252396, + "grad_norm": 0.04452383890748024, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2319 + }, + { + "epoch": 1.8530351437699681, + "grad_norm": 0.03368959203362465, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2320 + }, + { + "epoch": 1.8538338658146964, + "grad_norm": 0.05867767333984375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2321 + }, + { + "epoch": 1.854632587859425, + "grad_norm": 0.0774846225976944, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2322 + }, + { + "epoch": 1.8554313099041533, + "grad_norm": 0.05172058939933777, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2323 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 0.06597824394702911, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2324 + }, + { + "epoch": 1.8570287539936103, + "grad_norm": 0.10818778723478317, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2325 + }, + { + "epoch": 1.8578274760383386, + "grad_norm": 0.12698976695537567, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2326 + }, + { + "epoch": 1.858626198083067, + "grad_norm": 0.06547659635543823, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2327 + }, + { + "epoch": 1.8594249201277955, + "grad_norm": 0.08613643050193787, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2328 + }, + { + "epoch": 1.860223642172524, + "grad_norm": 0.23452800512313843, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2329 + }, + { + "epoch": 1.8610223642172525, + "grad_norm": 0.29293227195739746, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2330 + }, + { + "epoch": 1.8618210862619808, + "grad_norm": 0.17590634524822235, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2331 + }, + { + "epoch": 1.8626198083067091, + "grad_norm": 0.09830035269260406, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2332 + }, + { + "epoch": 1.8634185303514377, + "grad_norm": 0.2336016595363617, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2333 + }, + { + "epoch": 1.8642172523961662, + "grad_norm": 0.22990736365318298, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2334 + }, + { + "epoch": 1.8650159744408947, + "grad_norm": 0.14177313446998596, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2335 + }, + { + "epoch": 1.865814696485623, + "grad_norm": 0.07447824627161026, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2336 + }, + { + "epoch": 1.8666134185303513, + "grad_norm": 0.20551882684230804, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2337 + }, + { + "epoch": 1.8674121405750799, + "grad_norm": 0.21193428337574005, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2338 + }, + { + "epoch": 1.8682108626198084, + "grad_norm": 0.09889520704746246, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2339 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 0.06506047397851944, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2340 + }, + { + "epoch": 1.869808306709265, + "grad_norm": 0.10613662004470825, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2341 + }, + { + "epoch": 1.8706070287539935, + "grad_norm": 0.13049691915512085, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2342 + }, + { + "epoch": 1.871405750798722, + "grad_norm": 0.07257628440856934, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2343 + }, + { + "epoch": 1.8722044728434506, + "grad_norm": 0.05402761325240135, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2344 + }, + { + "epoch": 1.873003194888179, + "grad_norm": 0.1298513114452362, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2345 + }, + { + "epoch": 1.8738019169329072, + "grad_norm": 0.18854250013828278, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2346 + }, + { + "epoch": 1.8746006389776357, + "grad_norm": 0.18749283254146576, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2347 + }, + { + "epoch": 1.8753993610223643, + "grad_norm": 0.0791897177696228, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2348 + }, + { + "epoch": 1.8761980830670928, + "grad_norm": 0.061554014682769775, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2349 + }, + { + "epoch": 1.876996805111821, + "grad_norm": 0.07776489108800888, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2350 + }, + { + "epoch": 1.8777955271565494, + "grad_norm": 0.06406589597463608, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2351 + }, + { + "epoch": 1.878594249201278, + "grad_norm": 0.04364178702235222, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2352 + }, + { + "epoch": 1.8793929712460065, + "grad_norm": 0.14296351373195648, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2353 + }, + { + "epoch": 1.880191693290735, + "grad_norm": 0.23554368317127228, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2354 + }, + { + "epoch": 1.8809904153354633, + "grad_norm": 0.17022013664245605, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 2355 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 0.055340252816677094, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2356 + }, + { + "epoch": 1.8825878594249201, + "grad_norm": 0.10552496463060379, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2357 + }, + { + "epoch": 1.8833865814696487, + "grad_norm": 0.1601826697587967, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2358 + }, + { + "epoch": 1.884185303514377, + "grad_norm": 0.15029270946979523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2359 + }, + { + "epoch": 1.8849840255591053, + "grad_norm": 0.05186127871274948, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2360 + }, + { + "epoch": 1.8857827476038338, + "grad_norm": 0.10678224265575409, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2361 + }, + { + "epoch": 1.8865814696485623, + "grad_norm": 0.1380450427532196, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2362 + }, + { + "epoch": 1.8873801916932909, + "grad_norm": 0.08721969276666641, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2363 + }, + { + "epoch": 1.8881789137380192, + "grad_norm": 0.09425338357686996, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2364 + }, + { + "epoch": 1.8889776357827475, + "grad_norm": 0.16815589368343353, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2365 + }, + { + "epoch": 1.889776357827476, + "grad_norm": 0.16181580722332, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2366 + }, + { + "epoch": 1.8905750798722045, + "grad_norm": 0.054028045386075974, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2367 + }, + { + "epoch": 1.891373801916933, + "grad_norm": 0.07199764251708984, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2368 + }, + { + "epoch": 1.8921725239616614, + "grad_norm": 0.08493109047412872, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2369 + }, + { + "epoch": 1.8929712460063897, + "grad_norm": 0.09665308892726898, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 2370 + }, + { + "epoch": 1.8937699680511182, + "grad_norm": 0.07975895702838898, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2371 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 0.06089888513088226, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2372 + }, + { + "epoch": 1.895367412140575, + "grad_norm": 0.04610683396458626, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2373 + }, + { + "epoch": 1.8961661341853036, + "grad_norm": 0.06083180755376816, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2374 + }, + { + "epoch": 1.8969648562300319, + "grad_norm": 0.07177560776472092, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 2375 + }, + { + "epoch": 1.8977635782747604, + "grad_norm": 0.04214467853307724, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2376 + }, + { + "epoch": 1.898562300319489, + "grad_norm": 0.05166957527399063, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2377 + }, + { + "epoch": 1.8993610223642172, + "grad_norm": 0.040181614458560944, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2378 + }, + { + "epoch": 1.9001597444089455, + "grad_norm": 0.043485358357429504, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2379 + }, + { + "epoch": 1.900958466453674, + "grad_norm": 0.07395761460065842, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2380 + }, + { + "epoch": 1.9017571884984026, + "grad_norm": 0.05133877694606781, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 2381 + }, + { + "epoch": 1.9025559105431311, + "grad_norm": 0.059279292821884155, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2382 + }, + { + "epoch": 1.9033546325878594, + "grad_norm": 0.07573487609624863, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2383 + }, + { + "epoch": 1.9041533546325877, + "grad_norm": 0.07013942301273346, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2384 + }, + { + "epoch": 1.9049520766773163, + "grad_norm": 0.14524684846401215, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2385 + }, + { + "epoch": 1.9057507987220448, + "grad_norm": 0.17374426126480103, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2386 + }, + { + "epoch": 1.9065495207667733, + "grad_norm": 0.1387263685464859, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2387 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 0.045813702046871185, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2388 + }, + { + "epoch": 1.90814696485623, + "grad_norm": 0.189321830868721, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2389 + }, + { + "epoch": 1.9089456869009584, + "grad_norm": 0.261329710483551, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2390 + }, + { + "epoch": 1.909744408945687, + "grad_norm": 0.1599399596452713, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2391 + }, + { + "epoch": 1.9105431309904153, + "grad_norm": 0.03977127745747566, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2392 + }, + { + "epoch": 1.9113418530351438, + "grad_norm": 0.16269442439079285, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2393 + }, + { + "epoch": 1.9121405750798721, + "grad_norm": 0.22963251173496246, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2394 + }, + { + "epoch": 1.9129392971246006, + "grad_norm": 0.1526031792163849, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2395 + }, + { + "epoch": 1.9137380191693292, + "grad_norm": 0.07236737757921219, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 2396 + }, + { + "epoch": 1.9145367412140575, + "grad_norm": 0.19993482530117035, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2397 + }, + { + "epoch": 1.9153354632587858, + "grad_norm": 0.18950621783733368, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2398 + }, + { + "epoch": 1.9161341853035143, + "grad_norm": 0.10046153515577316, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2399 + }, + { + "epoch": 1.9169329073482428, + "grad_norm": 0.07884453237056732, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2400 + }, + { + "epoch": 1.9177316293929714, + "grad_norm": 0.23947227001190186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2401 + }, + { + "epoch": 1.9185303514376997, + "grad_norm": 0.2662964165210724, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2402 + }, + { + "epoch": 1.919329073482428, + "grad_norm": 0.1257917582988739, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2403 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 0.09092582017183304, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2404 + }, + { + "epoch": 1.920926517571885, + "grad_norm": 0.19677215814590454, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2405 + }, + { + "epoch": 1.9217252396166136, + "grad_norm": 0.17972320318222046, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2406 + }, + { + "epoch": 1.9225239616613419, + "grad_norm": 0.06155665963888168, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2407 + }, + { + "epoch": 1.9233226837060702, + "grad_norm": 0.14805591106414795, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2408 + }, + { + "epoch": 1.9241214057507987, + "grad_norm": 0.2414662092924118, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2409 + }, + { + "epoch": 1.9249201277955272, + "grad_norm": 0.2084181308746338, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2410 + }, + { + "epoch": 1.9257188498402555, + "grad_norm": 0.05523146688938141, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2411 + }, + { + "epoch": 1.926517571884984, + "grad_norm": 0.13994552195072174, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2412 + }, + { + "epoch": 1.9273162939297124, + "grad_norm": 0.2648966312408447, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2413 + }, + { + "epoch": 1.928115015974441, + "grad_norm": 0.28959497809410095, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2414 + }, + { + "epoch": 1.9289137380191694, + "grad_norm": 0.11457488685846329, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2415 + }, + { + "epoch": 1.9297124600638977, + "grad_norm": 0.12448041886091232, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2416 + }, + { + "epoch": 1.930511182108626, + "grad_norm": 0.20807982981204987, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2417 + }, + { + "epoch": 1.9313099041533546, + "grad_norm": 0.14537623524665833, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2418 + }, + { + "epoch": 1.932108626198083, + "grad_norm": 0.0428709015250206, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2419 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.07923824340105057, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2420 + }, + { + "epoch": 1.93370607028754, + "grad_norm": 0.06046072393655777, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2421 + }, + { + "epoch": 1.9345047923322682, + "grad_norm": 0.05921380594372749, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2422 + }, + { + "epoch": 1.9353035143769968, + "grad_norm": 0.05324951559305191, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2423 + }, + { + "epoch": 1.9361022364217253, + "grad_norm": 0.060725487768650055, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2424 + }, + { + "epoch": 1.9369009584664538, + "grad_norm": 0.09305386245250702, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2425 + }, + { + "epoch": 1.9376996805111821, + "grad_norm": 0.12314888834953308, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2426 + }, + { + "epoch": 1.9384984025559104, + "grad_norm": 0.08590805530548096, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2427 + }, + { + "epoch": 1.939297124600639, + "grad_norm": 0.07134587317705154, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2428 + }, + { + "epoch": 1.9400958466453675, + "grad_norm": 0.04584966599941254, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2429 + }, + { + "epoch": 1.9408945686900958, + "grad_norm": 0.050389841198921204, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2430 + }, + { + "epoch": 1.9416932907348243, + "grad_norm": 0.055894333869218826, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2431 + }, + { + "epoch": 1.9424920127795526, + "grad_norm": 0.05231403559446335, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2432 + }, + { + "epoch": 1.9432907348242812, + "grad_norm": 0.04235154017806053, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2433 + }, + { + "epoch": 1.9440894568690097, + "grad_norm": 0.038994334638118744, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2434 + }, + { + "epoch": 1.944888178913738, + "grad_norm": 0.062291134148836136, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2435 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.10267619043588638, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2436 + }, + { + "epoch": 1.9464856230031948, + "grad_norm": 0.12227646261453629, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2437 + }, + { + "epoch": 1.9472843450479234, + "grad_norm": 0.07677904516458511, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2438 + }, + { + "epoch": 1.9480830670926519, + "grad_norm": 0.043213456869125366, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2439 + }, + { + "epoch": 1.9488817891373802, + "grad_norm": 0.0464320071041584, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2440 + }, + { + "epoch": 1.9496805111821085, + "grad_norm": 0.0488814078271389, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2441 + }, + { + "epoch": 1.950479233226837, + "grad_norm": 0.07102649658918381, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2442 + }, + { + "epoch": 1.9512779552715656, + "grad_norm": 0.056355372071266174, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2443 + }, + { + "epoch": 1.952076677316294, + "grad_norm": 0.05412770435214043, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2444 + }, + { + "epoch": 1.9528753993610224, + "grad_norm": 0.05533284693956375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2445 + }, + { + "epoch": 1.9536741214057507, + "grad_norm": 0.07065420597791672, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2446 + }, + { + "epoch": 1.9544728434504792, + "grad_norm": 0.0424923375248909, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2447 + }, + { + "epoch": 1.9552715654952078, + "grad_norm": 0.07682394236326218, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2448 + }, + { + "epoch": 1.956070287539936, + "grad_norm": 0.12305673956871033, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2449 + }, + { + "epoch": 1.9568690095846646, + "grad_norm": 0.12699945271015167, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2450 + }, + { + "epoch": 1.957667731629393, + "grad_norm": 0.09973076730966568, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2451 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 0.04687270149588585, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2452 + }, + { + "epoch": 1.95926517571885, + "grad_norm": 0.16843228042125702, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2453 + }, + { + "epoch": 1.9600638977635783, + "grad_norm": 0.27191975712776184, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2454 + }, + { + "epoch": 1.9608626198083066, + "grad_norm": 0.2563989460468292, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2455 + }, + { + "epoch": 1.961661341853035, + "grad_norm": 0.10264059901237488, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2456 + }, + { + "epoch": 1.9624600638977636, + "grad_norm": 0.12051466107368469, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2457 + }, + { + "epoch": 1.9632587859424921, + "grad_norm": 0.27400559186935425, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2458 + }, + { + "epoch": 1.9640575079872205, + "grad_norm": 0.2756473124027252, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2459 + }, + { + "epoch": 1.9648562300319488, + "grad_norm": 0.09925543516874313, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2460 + }, + { + "epoch": 1.9656549520766773, + "grad_norm": 0.18176420032978058, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2461 + }, + { + "epoch": 1.9664536741214058, + "grad_norm": 0.353693425655365, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 2462 + }, + { + "epoch": 1.9672523961661343, + "grad_norm": 0.30674099922180176, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2463 + }, + { + "epoch": 1.9680511182108626, + "grad_norm": 0.04689846560359001, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2464 + }, + { + "epoch": 1.968849840255591, + "grad_norm": 0.29758918285369873, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2465 + }, + { + "epoch": 1.9696485623003195, + "grad_norm": 0.363922655582428, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2466 + }, + { + "epoch": 1.970447284345048, + "grad_norm": 0.19258317351341248, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2467 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 0.10317967087030411, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2468 + }, + { + "epoch": 1.9720447284345048, + "grad_norm": 0.2375856637954712, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2469 + }, + { + "epoch": 1.9728434504792332, + "grad_norm": 0.13130125403404236, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2470 + }, + { + "epoch": 1.9736421725239617, + "grad_norm": 0.08131767064332962, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2471 + }, + { + "epoch": 1.9744408945686902, + "grad_norm": 0.14860530197620392, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2472 + }, + { + "epoch": 1.9752396166134185, + "grad_norm": 0.11777997016906738, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2473 + }, + { + "epoch": 1.9760383386581468, + "grad_norm": 0.08397025614976883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2474 + }, + { + "epoch": 1.9768370607028753, + "grad_norm": 0.08824057132005692, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2475 + }, + { + "epoch": 1.9776357827476039, + "grad_norm": 0.06647378206253052, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2476 + }, + { + "epoch": 1.9784345047923324, + "grad_norm": 0.038043633103370667, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2477 + }, + { + "epoch": 1.9792332268370607, + "grad_norm": 0.08245793730020523, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2478 + }, + { + "epoch": 1.980031948881789, + "grad_norm": 0.1402815282344818, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2479 + }, + { + "epoch": 1.9808306709265175, + "grad_norm": 0.15749140083789825, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2480 + }, + { + "epoch": 1.981629392971246, + "grad_norm": 0.09396994858980179, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2481 + }, + { + "epoch": 1.9824281150159746, + "grad_norm": 0.0725923553109169, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2482 + }, + { + "epoch": 1.983226837060703, + "grad_norm": 0.06790316104888916, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2483 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 0.04050496965646744, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2484 + }, + { + "epoch": 1.9848242811501597, + "grad_norm": 0.04245828837156296, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2485 + }, + { + "epoch": 1.9856230031948883, + "grad_norm": 0.04818668216466904, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2486 + }, + { + "epoch": 1.9864217252396166, + "grad_norm": 0.07091481238603592, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2487 + }, + { + "epoch": 1.9872204472843449, + "grad_norm": 0.08975768834352493, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2488 + }, + { + "epoch": 1.9880191693290734, + "grad_norm": 0.0920509397983551, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2489 + }, + { + "epoch": 1.988817891373802, + "grad_norm": 0.06188343092799187, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2490 + }, + { + "epoch": 1.9896166134185305, + "grad_norm": 0.03998660668730736, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2491 + }, + { + "epoch": 1.9904153354632588, + "grad_norm": 0.03859339654445648, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2492 + }, + { + "epoch": 1.991214057507987, + "grad_norm": 0.050228461623191833, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2493 + }, + { + "epoch": 1.9920127795527156, + "grad_norm": 0.04037710279226303, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2494 + }, + { + "epoch": 1.9928115015974441, + "grad_norm": 0.04584654048085213, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2495 + }, + { + "epoch": 1.9936102236421727, + "grad_norm": 0.03696245700120926, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2496 + }, + { + "epoch": 1.994408945686901, + "grad_norm": 0.04600491747260094, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2497 + }, + { + "epoch": 1.9952076677316293, + "grad_norm": 0.0943571925163269, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2498 + }, + { + "epoch": 1.9960063897763578, + "grad_norm": 0.11350230127573013, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2499 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.09816325455904007, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2500 + }, + { + "epoch": 1.9976038338658149, + "grad_norm": 0.05887974426150322, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2501 + }, + { + "epoch": 1.9984025559105432, + "grad_norm": 0.039232514798641205, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2502 + }, + { + "epoch": 1.9992012779552715, + "grad_norm": 0.10776908695697784, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2503 + }, + { + "epoch": 2.0, + "grad_norm": 0.1708499789237976, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2504 + }, + { + "epoch": 2.0007987220447285, + "grad_norm": 0.12712575495243073, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2505 + }, + { + "epoch": 2.001597444089457, + "grad_norm": 0.04130035266280174, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2506 + }, + { + "epoch": 2.002396166134185, + "grad_norm": 0.08062197268009186, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2507 + }, + { + "epoch": 2.0031948881789137, + "grad_norm": 0.11429931968450546, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2508 + }, + { + "epoch": 2.003993610223642, + "grad_norm": 0.06290867924690247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2509 + }, + { + "epoch": 2.0047923322683707, + "grad_norm": 0.043735455721616745, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2510 + }, + { + "epoch": 2.0055910543130993, + "grad_norm": 0.08331973850727081, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2511 + }, + { + "epoch": 2.0063897763578273, + "grad_norm": 0.07424676418304443, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2512 + }, + { + "epoch": 2.007188498402556, + "grad_norm": 0.0450097881257534, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2513 + }, + { + "epoch": 2.0079872204472844, + "grad_norm": 0.05486248433589935, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2514 + }, + { + "epoch": 2.008785942492013, + "grad_norm": 0.03456762805581093, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2515 + }, + { + "epoch": 2.009584664536741, + "grad_norm": 0.060457173734903336, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2516 + }, + { + "epoch": 2.0103833865814695, + "grad_norm": 0.11361896246671677, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2517 + }, + { + "epoch": 2.011182108626198, + "grad_norm": 0.13272768259048462, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2518 + }, + { + "epoch": 2.0119808306709266, + "grad_norm": 0.06579867750406265, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2519 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.06989869475364685, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2520 + }, + { + "epoch": 2.013578274760383, + "grad_norm": 0.10227718949317932, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2521 + }, + { + "epoch": 2.0143769968051117, + "grad_norm": 0.1155320331454277, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2522 + }, + { + "epoch": 2.0151757188498403, + "grad_norm": 0.08428250998258591, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2523 + }, + { + "epoch": 2.015974440894569, + "grad_norm": 0.07322479784488678, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2524 + }, + { + "epoch": 2.0167731629392973, + "grad_norm": 0.0683116540312767, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2525 + }, + { + "epoch": 2.0175718849840254, + "grad_norm": 0.05594201013445854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2526 + }, + { + "epoch": 2.018370607028754, + "grad_norm": 0.08582351356744766, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2527 + }, + { + "epoch": 2.0191693290734825, + "grad_norm": 0.16223077476024628, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2528 + }, + { + "epoch": 2.019968051118211, + "grad_norm": 0.23563791811466217, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2529 + }, + { + "epoch": 2.0207667731629395, + "grad_norm": 0.2101173847913742, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2530 + }, + { + "epoch": 2.0215654952076676, + "grad_norm": 0.14453741908073425, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2531 + }, + { + "epoch": 2.022364217252396, + "grad_norm": 0.050489380955696106, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2532 + }, + { + "epoch": 2.0231629392971247, + "grad_norm": 0.17723125219345093, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2533 + }, + { + "epoch": 2.023961661341853, + "grad_norm": 0.18600088357925415, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2534 + }, + { + "epoch": 2.0247603833865813, + "grad_norm": 0.10898424685001373, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2535 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.07256787270307541, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2536 + }, + { + "epoch": 2.0263578274760383, + "grad_norm": 0.1978672444820404, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2537 + }, + { + "epoch": 2.027156549520767, + "grad_norm": 0.20623594522476196, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2538 + }, + { + "epoch": 2.0279552715654954, + "grad_norm": 0.08837094157934189, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2539 + }, + { + "epoch": 2.0287539936102235, + "grad_norm": 0.10977557301521301, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2540 + }, + { + "epoch": 2.029552715654952, + "grad_norm": 0.24850067496299744, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2541 + }, + { + "epoch": 2.0303514376996805, + "grad_norm": 0.29207590222358704, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2542 + }, + { + "epoch": 2.031150159744409, + "grad_norm": 0.1985940933227539, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2543 + }, + { + "epoch": 2.0319488817891376, + "grad_norm": 0.04519326612353325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2544 + }, + { + "epoch": 2.0327476038338657, + "grad_norm": 0.16939495503902435, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2545 + }, + { + "epoch": 2.033546325878594, + "grad_norm": 0.270275354385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2546 + }, + { + "epoch": 2.0343450479233227, + "grad_norm": 0.21180108189582825, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2547 + }, + { + "epoch": 2.0351437699680512, + "grad_norm": 0.0469316728413105, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2548 + }, + { + "epoch": 2.0359424920127798, + "grad_norm": 0.1845361739397049, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2549 + }, + { + "epoch": 2.036741214057508, + "grad_norm": 0.2276308536529541, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2550 + }, + { + "epoch": 2.0375399361022364, + "grad_norm": 0.11676277965307236, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2551 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 0.1021813154220581, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2552 + }, + { + "epoch": 2.0391373801916934, + "grad_norm": 0.28504467010498047, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2553 + }, + { + "epoch": 2.0399361022364215, + "grad_norm": 0.2821798324584961, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2554 + }, + { + "epoch": 2.04073482428115, + "grad_norm": 0.09673242270946503, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2555 + }, + { + "epoch": 2.0415335463258786, + "grad_norm": 0.1784241944551468, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2556 + }, + { + "epoch": 2.042332268370607, + "grad_norm": 0.30749815702438354, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2557 + }, + { + "epoch": 2.0431309904153356, + "grad_norm": 0.2625802457332611, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2558 + }, + { + "epoch": 2.0439297124600637, + "grad_norm": 0.0651462972164154, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 2559 + }, + { + "epoch": 2.0447284345047922, + "grad_norm": 0.2103819102048874, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 2560 + }, + { + "epoch": 2.0455271565495208, + "grad_norm": 0.2854102849960327, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2561 + }, + { + "epoch": 2.0463258785942493, + "grad_norm": 0.14184293150901794, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2562 + }, + { + "epoch": 2.047124600638978, + "grad_norm": 0.06151473522186279, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2563 + }, + { + "epoch": 2.047923322683706, + "grad_norm": 0.1858600378036499, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2564 + }, + { + "epoch": 2.0487220447284344, + "grad_norm": 0.19997341930866241, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2565 + }, + { + "epoch": 2.049520766773163, + "grad_norm": 0.0924893170595169, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2566 + }, + { + "epoch": 2.0503194888178915, + "grad_norm": 0.14571507275104523, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2567 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.2566513121128082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2568 + }, + { + "epoch": 2.051916932907348, + "grad_norm": 0.24462486803531647, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2569 + }, + { + "epoch": 2.0527156549520766, + "grad_norm": 0.10544434189796448, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2570 + }, + { + "epoch": 2.053514376996805, + "grad_norm": 0.08675491809844971, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2571 + }, + { + "epoch": 2.0543130990415337, + "grad_norm": 0.18398417532444, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2572 + }, + { + "epoch": 2.055111821086262, + "grad_norm": 0.15167878568172455, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2573 + }, + { + "epoch": 2.0559105431309903, + "grad_norm": 0.06932301074266434, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2574 + }, + { + "epoch": 2.056709265175719, + "grad_norm": 0.06368319690227509, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2575 + }, + { + "epoch": 2.0575079872204474, + "grad_norm": 0.11785905808210373, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2576 + }, + { + "epoch": 2.058306709265176, + "grad_norm": 0.05494855344295502, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2577 + }, + { + "epoch": 2.059105431309904, + "grad_norm": 0.10618741810321808, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2578 + }, + { + "epoch": 2.0599041533546325, + "grad_norm": 0.14729735255241394, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2579 + }, + { + "epoch": 2.060702875399361, + "grad_norm": 0.08014677464962006, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2580 + }, + { + "epoch": 2.0615015974440896, + "grad_norm": 0.07460471242666245, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2581 + }, + { + "epoch": 2.062300319488818, + "grad_norm": 0.12884479761123657, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2582 + }, + { + "epoch": 2.063099041533546, + "grad_norm": 0.11224616318941116, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2583 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.06026687100529671, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2584 + }, + { + "epoch": 2.0646964856230032, + "grad_norm": 0.06690093874931335, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2585 + }, + { + "epoch": 2.0654952076677318, + "grad_norm": 0.10095079988241196, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2586 + }, + { + "epoch": 2.06629392971246, + "grad_norm": 0.08353506028652191, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2587 + }, + { + "epoch": 2.0670926517571884, + "grad_norm": 0.07060668617486954, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2588 + }, + { + "epoch": 2.067891373801917, + "grad_norm": 0.07298587262630463, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2589 + }, + { + "epoch": 2.0686900958466454, + "grad_norm": 0.04319034889340401, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2590 + }, + { + "epoch": 2.069488817891374, + "grad_norm": 0.04229504242539406, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2591 + }, + { + "epoch": 2.070287539936102, + "grad_norm": 0.05476998910307884, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2592 + }, + { + "epoch": 2.0710862619808306, + "grad_norm": 0.039188139140605927, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2593 + }, + { + "epoch": 2.071884984025559, + "grad_norm": 0.058993417769670486, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2594 + }, + { + "epoch": 2.0726837060702876, + "grad_norm": 0.04871759191155434, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2595 + }, + { + "epoch": 2.073482428115016, + "grad_norm": 0.037119925022125244, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2596 + }, + { + "epoch": 2.0742811501597442, + "grad_norm": 0.06476760655641556, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2597 + }, + { + "epoch": 2.0750798722044728, + "grad_norm": 0.03558475151658058, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2598 + }, + { + "epoch": 2.0758785942492013, + "grad_norm": 0.03988872841000557, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2599 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.04446236789226532, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2600 + }, + { + "epoch": 2.0774760383386583, + "grad_norm": 0.058075740933418274, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2601 + }, + { + "epoch": 2.0782747603833864, + "grad_norm": 0.10492820292711258, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2602 + }, + { + "epoch": 2.079073482428115, + "grad_norm": 0.1374005526304245, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2603 + }, + { + "epoch": 2.0798722044728435, + "grad_norm": 0.10932788252830505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 2604 + }, + { + "epoch": 2.080670926517572, + "grad_norm": 0.035826049745082855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2605 + }, + { + "epoch": 2.0814696485623, + "grad_norm": 0.10934802889823914, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2606 + }, + { + "epoch": 2.0822683706070286, + "grad_norm": 0.13302485644817352, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2607 + }, + { + "epoch": 2.083067092651757, + "grad_norm": 0.11253390461206436, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2608 + }, + { + "epoch": 2.0838658146964857, + "grad_norm": 0.04634593054652214, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2609 + }, + { + "epoch": 2.084664536741214, + "grad_norm": 0.21137909591197968, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2610 + }, + { + "epoch": 2.0854632587859423, + "grad_norm": 0.2771414816379547, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2611 + }, + { + "epoch": 2.086261980830671, + "grad_norm": 0.1959906965494156, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2612 + }, + { + "epoch": 2.0870607028753994, + "grad_norm": 0.042694322764873505, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2613 + }, + { + "epoch": 2.087859424920128, + "grad_norm": 0.15753871202468872, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2614 + }, + { + "epoch": 2.0886581469648564, + "grad_norm": 0.1917339563369751, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2615 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.05056089907884598, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2616 + }, + { + "epoch": 2.090255591054313, + "grad_norm": 0.16167999804019928, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2617 + }, + { + "epoch": 2.0910543130990416, + "grad_norm": 0.21019205451011658, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2618 + }, + { + "epoch": 2.09185303514377, + "grad_norm": 0.12859253585338593, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2619 + }, + { + "epoch": 2.0926517571884986, + "grad_norm": 0.04561556130647659, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2620 + }, + { + "epoch": 2.0934504792332267, + "grad_norm": 0.19915086030960083, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2621 + }, + { + "epoch": 2.094249201277955, + "grad_norm": 0.2792043685913086, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2622 + }, + { + "epoch": 2.0950479233226837, + "grad_norm": 0.16861289739608765, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2623 + }, + { + "epoch": 2.0958466453674123, + "grad_norm": 0.08431511372327805, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2624 + }, + { + "epoch": 2.0966453674121404, + "grad_norm": 0.26860734820365906, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2625 + }, + { + "epoch": 2.097444089456869, + "grad_norm": 0.2949545979499817, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2626 + }, + { + "epoch": 2.0982428115015974, + "grad_norm": 0.12639857828617096, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2627 + }, + { + "epoch": 2.099041533546326, + "grad_norm": 0.14675533771514893, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2628 + }, + { + "epoch": 2.0998402555910545, + "grad_norm": 0.29298654198646545, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2629 + }, + { + "epoch": 2.1006389776357826, + "grad_norm": 0.20049460232257843, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2630 + }, + { + "epoch": 2.101437699680511, + "grad_norm": 0.05280651897192001, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2631 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.2405036836862564, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2632 + }, + { + "epoch": 2.103035143769968, + "grad_norm": 0.29925718903541565, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2633 + }, + { + "epoch": 2.1038338658146967, + "grad_norm": 0.1330690085887909, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2634 + }, + { + "epoch": 2.1046325878594248, + "grad_norm": 0.11366300284862518, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2635 + }, + { + "epoch": 2.1054313099041533, + "grad_norm": 0.184611514210701, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2636 + }, + { + "epoch": 2.106230031948882, + "grad_norm": 0.0942547619342804, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2637 + }, + { + "epoch": 2.1070287539936103, + "grad_norm": 0.09224486351013184, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2638 + }, + { + "epoch": 2.107827476038339, + "grad_norm": 0.2167433351278305, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2639 + }, + { + "epoch": 2.108626198083067, + "grad_norm": 0.20001453161239624, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2640 + }, + { + "epoch": 2.1094249201277955, + "grad_norm": 0.0551394522190094, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2641 + }, + { + "epoch": 2.110223642172524, + "grad_norm": 0.14991897344589233, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2642 + }, + { + "epoch": 2.1110223642172525, + "grad_norm": 0.21038007736206055, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2643 + }, + { + "epoch": 2.1118210862619806, + "grad_norm": 0.11942024528980255, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2644 + }, + { + "epoch": 2.112619808306709, + "grad_norm": 0.14938029646873474, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2645 + }, + { + "epoch": 2.1134185303514377, + "grad_norm": 0.3405923843383789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2646 + }, + { + "epoch": 2.114217252396166, + "grad_norm": 0.3363925814628601, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2647 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.12379220873117447, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2648 + }, + { + "epoch": 2.115814696485623, + "grad_norm": 0.1583731323480606, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2649 + }, + { + "epoch": 2.1166134185303513, + "grad_norm": 0.2941076457500458, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2650 + }, + { + "epoch": 2.11741214057508, + "grad_norm": 0.18513287603855133, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2651 + }, + { + "epoch": 2.1182108626198084, + "grad_norm": 0.057797662913799286, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2652 + }, + { + "epoch": 2.119009584664537, + "grad_norm": 0.12461342662572861, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2653 + }, + { + "epoch": 2.119808306709265, + "grad_norm": 0.06276709586381912, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2654 + }, + { + "epoch": 2.1206070287539935, + "grad_norm": 0.06073528528213501, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2655 + }, + { + "epoch": 2.121405750798722, + "grad_norm": 0.07055814564228058, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2656 + }, + { + "epoch": 2.1222044728434506, + "grad_norm": 0.03508429974317551, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2657 + }, + { + "epoch": 2.123003194888179, + "grad_norm": 0.0474206916987896, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2658 + }, + { + "epoch": 2.123801916932907, + "grad_norm": 0.04067448526620865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2659 + }, + { + "epoch": 2.1246006389776357, + "grad_norm": 0.060025133192539215, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2660 + }, + { + "epoch": 2.1253993610223643, + "grad_norm": 0.061696235090494156, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2661 + }, + { + "epoch": 2.126198083067093, + "grad_norm": 0.060907844454050064, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2662 + }, + { + "epoch": 2.126996805111821, + "grad_norm": 0.06122025474905968, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 2663 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.06885300576686859, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2664 + }, + { + "epoch": 2.128594249201278, + "grad_norm": 0.047428976744413376, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2665 + }, + { + "epoch": 2.1293929712460065, + "grad_norm": 0.036644674837589264, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2666 + }, + { + "epoch": 2.130191693290735, + "grad_norm": 0.04983266070485115, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2667 + }, + { + "epoch": 2.130990415335463, + "grad_norm": 0.09072417765855789, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2668 + }, + { + "epoch": 2.1317891373801916, + "grad_norm": 0.10644412785768509, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2669 + }, + { + "epoch": 2.13258785942492, + "grad_norm": 0.07350479066371918, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2670 + }, + { + "epoch": 2.1333865814696487, + "grad_norm": 0.041709840297698975, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2671 + }, + { + "epoch": 2.134185303514377, + "grad_norm": 0.043592557311058044, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2672 + }, + { + "epoch": 2.1349840255591053, + "grad_norm": 0.04548558592796326, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2673 + }, + { + "epoch": 2.135782747603834, + "grad_norm": 0.03937267139554024, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2674 + }, + { + "epoch": 2.1365814696485623, + "grad_norm": 0.05674131214618683, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2675 + }, + { + "epoch": 2.137380191693291, + "grad_norm": 0.0857989713549614, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2676 + }, + { + "epoch": 2.1381789137380194, + "grad_norm": 0.12659871578216553, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2677 + }, + { + "epoch": 2.1389776357827475, + "grad_norm": 0.10000529885292053, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2678 + }, + { + "epoch": 2.139776357827476, + "grad_norm": 0.060805950313806534, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2679 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.20407895743846893, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2680 + }, + { + "epoch": 2.141373801916933, + "grad_norm": 0.21931609511375427, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2681 + }, + { + "epoch": 2.142172523961661, + "grad_norm": 0.0947318896651268, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2682 + }, + { + "epoch": 2.1429712460063897, + "grad_norm": 0.10082453489303589, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2683 + }, + { + "epoch": 2.143769968051118, + "grad_norm": 0.2510482370853424, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2684 + }, + { + "epoch": 2.1445686900958467, + "grad_norm": 0.2802210748195648, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 2685 + }, + { + "epoch": 2.1453674121405752, + "grad_norm": 0.18770602345466614, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2686 + }, + { + "epoch": 2.1461661341853033, + "grad_norm": 0.048588722944259644, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2687 + }, + { + "epoch": 2.146964856230032, + "grad_norm": 0.1443304419517517, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2688 + }, + { + "epoch": 2.1477635782747604, + "grad_norm": 0.22439543902873993, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2689 + }, + { + "epoch": 2.148562300319489, + "grad_norm": 0.16312581300735474, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2690 + }, + { + "epoch": 2.1493610223642174, + "grad_norm": 0.08721408247947693, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2691 + }, + { + "epoch": 2.1501597444089455, + "grad_norm": 0.2756902873516083, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2692 + }, + { + "epoch": 2.150958466453674, + "grad_norm": 0.2834199070930481, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2693 + }, + { + "epoch": 2.1517571884984026, + "grad_norm": 0.1190086081624031, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2694 + }, + { + "epoch": 2.152555910543131, + "grad_norm": 0.1246909499168396, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2695 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.2244880348443985, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2696 + }, + { + "epoch": 2.1541533546325877, + "grad_norm": 0.1424233317375183, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2697 + }, + { + "epoch": 2.1549520766773163, + "grad_norm": 0.10756697505712509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2698 + }, + { + "epoch": 2.155750798722045, + "grad_norm": 0.1688450276851654, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2699 + }, + { + "epoch": 2.1565495207667733, + "grad_norm": 0.12139362096786499, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 2700 + }, + { + "epoch": 2.1573482428115014, + "grad_norm": 0.07833441346883774, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 2701 + }, + { + "epoch": 2.15814696485623, + "grad_norm": 0.22099994122982025, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2702 + }, + { + "epoch": 2.1589456869009584, + "grad_norm": 0.190511554479599, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2703 + }, + { + "epoch": 2.159744408945687, + "grad_norm": 0.07637764513492584, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2704 + }, + { + "epoch": 2.1605431309904155, + "grad_norm": 0.06381702423095703, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2705 + }, + { + "epoch": 2.1613418530351436, + "grad_norm": 0.1343991458415985, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2706 + }, + { + "epoch": 2.162140575079872, + "grad_norm": 0.13090470433235168, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2707 + }, + { + "epoch": 2.1629392971246006, + "grad_norm": 0.04627209156751633, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2708 + }, + { + "epoch": 2.163738019169329, + "grad_norm": 0.060849517583847046, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2709 + }, + { + "epoch": 2.1645367412140577, + "grad_norm": 0.06780707836151123, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2710 + }, + { + "epoch": 2.165335463258786, + "grad_norm": 0.07282490283250809, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 2711 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 0.07168543338775635, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2712 + }, + { + "epoch": 2.166932907348243, + "grad_norm": 0.08716403692960739, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2713 + }, + { + "epoch": 2.1677316293929714, + "grad_norm": 0.09366965293884277, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2714 + }, + { + "epoch": 2.1685303514377, + "grad_norm": 0.09121392667293549, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2715 + }, + { + "epoch": 2.169329073482428, + "grad_norm": 0.06912577152252197, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2716 + }, + { + "epoch": 2.1701277955271565, + "grad_norm": 0.046476542949676514, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2717 + }, + { + "epoch": 2.170926517571885, + "grad_norm": 0.04065564647316933, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2718 + }, + { + "epoch": 2.1717252396166136, + "grad_norm": 0.044998086988925934, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2719 + }, + { + "epoch": 2.1725239616613417, + "grad_norm": 0.04588993638753891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2720 + }, + { + "epoch": 2.17332268370607, + "grad_norm": 0.05954091623425484, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2721 + }, + { + "epoch": 2.1741214057507987, + "grad_norm": 0.07627220451831818, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2722 + }, + { + "epoch": 2.1749201277955272, + "grad_norm": 0.0832771435379982, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2723 + }, + { + "epoch": 2.1757188498402558, + "grad_norm": 0.09901522845029831, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2724 + }, + { + "epoch": 2.176517571884984, + "grad_norm": 0.05773104354739189, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2725 + }, + { + "epoch": 2.1773162939297124, + "grad_norm": 0.0783318281173706, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2726 + }, + { + "epoch": 2.178115015974441, + "grad_norm": 0.12447014451026917, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2727 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.08944697678089142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2728 + }, + { + "epoch": 2.179712460063898, + "grad_norm": 0.07295451313257217, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2729 + }, + { + "epoch": 2.180511182108626, + "grad_norm": 0.1335693746805191, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2730 + }, + { + "epoch": 2.1813099041533546, + "grad_norm": 0.14618094265460968, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2731 + }, + { + "epoch": 2.182108626198083, + "grad_norm": 0.05047796294093132, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2732 + }, + { + "epoch": 2.1829073482428116, + "grad_norm": 0.18955212831497192, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2733 + }, + { + "epoch": 2.18370607028754, + "grad_norm": 0.3394540250301361, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2734 + }, + { + "epoch": 2.1845047923322682, + "grad_norm": 0.34607887268066406, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2735 + }, + { + "epoch": 2.1853035143769968, + "grad_norm": 0.19489939510822296, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2736 + }, + { + "epoch": 2.1861022364217253, + "grad_norm": 0.06775379180908203, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2737 + }, + { + "epoch": 2.186900958466454, + "grad_norm": 0.2376859039068222, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2738 + }, + { + "epoch": 2.187699680511182, + "grad_norm": 0.22686026990413666, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2739 + }, + { + "epoch": 2.1884984025559104, + "grad_norm": 0.059437282383441925, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2740 + }, + { + "epoch": 2.189297124600639, + "grad_norm": 0.184672549366951, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2741 + }, + { + "epoch": 2.1900958466453675, + "grad_norm": 0.21975156664848328, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2742 + }, + { + "epoch": 2.190894568690096, + "grad_norm": 0.08795829117298126, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2743 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.1045440062880516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2744 + }, + { + "epoch": 2.1924920127795526, + "grad_norm": 0.21037985384464264, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2745 + }, + { + "epoch": 2.193290734824281, + "grad_norm": 0.17791713774204254, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2746 + }, + { + "epoch": 2.1940894568690097, + "grad_norm": 0.06028178334236145, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2747 + }, + { + "epoch": 2.194888178913738, + "grad_norm": 0.0801217257976532, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2748 + }, + { + "epoch": 2.1956869009584663, + "grad_norm": 0.11564524471759796, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2749 + }, + { + "epoch": 2.196485623003195, + "grad_norm": 0.0652003139257431, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2750 + }, + { + "epoch": 2.1972843450479234, + "grad_norm": 0.057818979024887085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2751 + }, + { + "epoch": 2.198083067092652, + "grad_norm": 0.10466332733631134, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2752 + }, + { + "epoch": 2.1988817891373804, + "grad_norm": 0.09350129216909409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2753 + }, + { + "epoch": 2.1996805111821085, + "grad_norm": 0.04295926168560982, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2754 + }, + { + "epoch": 2.200479233226837, + "grad_norm": 0.0851534903049469, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2755 + }, + { + "epoch": 2.2012779552715656, + "grad_norm": 0.1857217401266098, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2756 + }, + { + "epoch": 2.202076677316294, + "grad_norm": 0.18267984688282013, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2757 + }, + { + "epoch": 2.202875399361022, + "grad_norm": 0.07249841094017029, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2758 + }, + { + "epoch": 2.2036741214057507, + "grad_norm": 0.14335495233535767, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2759 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.24338914453983307, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2760 + }, + { + "epoch": 2.2052715654952078, + "grad_norm": 0.17772778868675232, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2761 + }, + { + "epoch": 2.2060702875399363, + "grad_norm": 0.04809113219380379, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2762 + }, + { + "epoch": 2.2068690095846644, + "grad_norm": 0.09682228416204453, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2763 + }, + { + "epoch": 2.207667731629393, + "grad_norm": 0.13868102431297302, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2764 + }, + { + "epoch": 2.2084664536741214, + "grad_norm": 0.10956277698278427, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2765 + }, + { + "epoch": 2.20926517571885, + "grad_norm": 0.06163526698946953, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2766 + }, + { + "epoch": 2.2100638977635785, + "grad_norm": 0.14519700407981873, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2767 + }, + { + "epoch": 2.2108626198083066, + "grad_norm": 0.12486071139574051, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2768 + }, + { + "epoch": 2.211661341853035, + "grad_norm": 0.0414549857378006, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2769 + }, + { + "epoch": 2.2124600638977636, + "grad_norm": 0.13828913867473602, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2770 + }, + { + "epoch": 2.213258785942492, + "grad_norm": 0.18277914822101593, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2771 + }, + { + "epoch": 2.2140575079872207, + "grad_norm": 0.15727964043617249, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2772 + }, + { + "epoch": 2.2148562300319488, + "grad_norm": 0.07437993586063385, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2773 + }, + { + "epoch": 2.2156549520766773, + "grad_norm": 0.08192550390958786, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2774 + }, + { + "epoch": 2.216453674121406, + "grad_norm": 0.1804617941379547, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2775 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.18431466817855835, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2776 + }, + { + "epoch": 2.2180511182108624, + "grad_norm": 0.11281057447195053, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2777 + }, + { + "epoch": 2.218849840255591, + "grad_norm": 0.0398496650159359, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2778 + }, + { + "epoch": 2.2196485623003195, + "grad_norm": 0.16930198669433594, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2779 + }, + { + "epoch": 2.220447284345048, + "grad_norm": 0.2384660542011261, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2780 + }, + { + "epoch": 2.2212460063897765, + "grad_norm": 0.18867406249046326, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2781 + }, + { + "epoch": 2.2220447284345046, + "grad_norm": 0.041189488023519516, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2782 + }, + { + "epoch": 2.222843450479233, + "grad_norm": 0.21946212649345398, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2783 + }, + { + "epoch": 2.2236421725239617, + "grad_norm": 0.3394725024700165, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2784 + }, + { + "epoch": 2.22444089456869, + "grad_norm": 0.09503358602523804, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2785 + }, + { + "epoch": 2.2252396166134187, + "grad_norm": 0.180524080991745, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2786 + }, + { + "epoch": 2.226038338658147, + "grad_norm": 0.2961865961551666, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2787 + }, + { + "epoch": 2.2268370607028753, + "grad_norm": 0.25913500785827637, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2788 + }, + { + "epoch": 2.227635782747604, + "grad_norm": 0.08123381435871124, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2789 + }, + { + "epoch": 2.2284345047923324, + "grad_norm": 0.18587692081928253, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2790 + }, + { + "epoch": 2.229233226837061, + "grad_norm": 0.29838815331459045, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2791 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.2115599811077118, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2792 + }, + { + "epoch": 2.2308306709265175, + "grad_norm": 0.04708286374807358, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2793 + }, + { + "epoch": 2.231629392971246, + "grad_norm": 0.224795401096344, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2794 + }, + { + "epoch": 2.2324281150159746, + "grad_norm": 0.2673366665840149, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2795 + }, + { + "epoch": 2.2332268370607027, + "grad_norm": 0.1223720833659172, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2796 + }, + { + "epoch": 2.234025559105431, + "grad_norm": 0.12798862159252167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2797 + }, + { + "epoch": 2.2348242811501597, + "grad_norm": 0.25721317529678345, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2798 + }, + { + "epoch": 2.2356230031948883, + "grad_norm": 0.16970157623291016, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2799 + }, + { + "epoch": 2.236421725239617, + "grad_norm": 0.1311950534582138, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 2800 + }, + { + "epoch": 2.237220447284345, + "grad_norm": 0.32154732942581177, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2801 + }, + { + "epoch": 2.2380191693290734, + "grad_norm": 0.23601645231246948, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2802 + }, + { + "epoch": 2.238817891373802, + "grad_norm": 0.08307314664125443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2803 + }, + { + "epoch": 2.2396166134185305, + "grad_norm": 0.31183329224586487, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 2804 + }, + { + "epoch": 2.2404153354632586, + "grad_norm": 0.27391767501831055, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2805 + }, + { + "epoch": 2.241214057507987, + "grad_norm": 0.07247646898031235, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2806 + }, + { + "epoch": 2.2420127795527156, + "grad_norm": 0.1882690042257309, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2807 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.18179158866405487, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2808 + }, + { + "epoch": 2.2436102236421727, + "grad_norm": 0.10761548578739166, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2809 + }, + { + "epoch": 2.244408945686901, + "grad_norm": 0.3067700266838074, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2810 + }, + { + "epoch": 2.2452076677316293, + "grad_norm": 0.17450691759586334, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 2811 + }, + { + "epoch": 2.246006389776358, + "grad_norm": 0.14480780065059662, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2812 + }, + { + "epoch": 2.2468051118210863, + "grad_norm": 0.3325321078300476, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2813 + }, + { + "epoch": 2.247603833865815, + "grad_norm": 0.26238250732421875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2814 + }, + { + "epoch": 2.248402555910543, + "grad_norm": 0.07829522341489792, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2815 + }, + { + "epoch": 2.2492012779552715, + "grad_norm": 0.269721657037735, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2816 + }, + { + "epoch": 2.25, + "grad_norm": 0.16362956166267395, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2817 + }, + { + "epoch": 2.2507987220447285, + "grad_norm": 0.08129733055830002, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2818 + }, + { + "epoch": 2.251597444089457, + "grad_norm": 0.18430721759796143, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2819 + }, + { + "epoch": 2.252396166134185, + "grad_norm": 0.09634844213724136, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2820 + }, + { + "epoch": 2.2531948881789137, + "grad_norm": 0.08204549551010132, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2821 + }, + { + "epoch": 2.253993610223642, + "grad_norm": 0.1140882819890976, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2822 + }, + { + "epoch": 2.2547923322683707, + "grad_norm": 0.05056345462799072, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2823 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.06505320966243744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2824 + }, + { + "epoch": 2.2563897763578273, + "grad_norm": 0.11316727101802826, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2825 + }, + { + "epoch": 2.257188498402556, + "grad_norm": 0.1036633774638176, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2826 + }, + { + "epoch": 2.2579872204472844, + "grad_norm": 0.0470670685172081, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 2827 + }, + { + "epoch": 2.258785942492013, + "grad_norm": 0.0880327895283699, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2828 + }, + { + "epoch": 2.2595846645367414, + "grad_norm": 0.07664912939071655, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2829 + }, + { + "epoch": 2.2603833865814695, + "grad_norm": 0.049471575766801834, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2830 + }, + { + "epoch": 2.261182108626198, + "grad_norm": 0.04288775101304054, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2831 + }, + { + "epoch": 2.2619808306709266, + "grad_norm": 0.10124537348747253, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2832 + }, + { + "epoch": 2.262779552715655, + "grad_norm": 0.13865061104297638, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2833 + }, + { + "epoch": 2.263578274760383, + "grad_norm": 0.10227467864751816, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2834 + }, + { + "epoch": 2.2643769968051117, + "grad_norm": 0.050575822591781616, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 2835 + }, + { + "epoch": 2.2651757188498403, + "grad_norm": 0.044946715235710144, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2836 + }, + { + "epoch": 2.265974440894569, + "grad_norm": 0.0712895616889, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2837 + }, + { + "epoch": 2.2667731629392973, + "grad_norm": 0.07044374942779541, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2838 + }, + { + "epoch": 2.2675718849840254, + "grad_norm": 0.04518461972475052, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2839 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 0.05259617418050766, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2840 + }, + { + "epoch": 2.2691693290734825, + "grad_norm": 0.0654863640666008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2841 + }, + { + "epoch": 2.269968051118211, + "grad_norm": 0.04345248267054558, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2842 + }, + { + "epoch": 2.270766773162939, + "grad_norm": 0.057224296033382416, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2843 + }, + { + "epoch": 2.2715654952076676, + "grad_norm": 0.11091717332601547, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2844 + }, + { + "epoch": 2.272364217252396, + "grad_norm": 0.11426062136888504, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2845 + }, + { + "epoch": 2.2731629392971247, + "grad_norm": 0.10064966231584549, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2846 + }, + { + "epoch": 2.273961661341853, + "grad_norm": 0.13716623187065125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2847 + }, + { + "epoch": 2.2747603833865817, + "grad_norm": 0.09014318138360977, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2848 + }, + { + "epoch": 2.27555910543131, + "grad_norm": 0.16652478277683258, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2849 + }, + { + "epoch": 2.2763578274760383, + "grad_norm": 0.14217601716518402, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2850 + }, + { + "epoch": 2.277156549520767, + "grad_norm": 0.03895508497953415, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2851 + }, + { + "epoch": 2.2779552715654954, + "grad_norm": 0.17713558673858643, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2852 + }, + { + "epoch": 2.2787539936102235, + "grad_norm": 0.32960572838783264, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 2853 + }, + { + "epoch": 2.279552715654952, + "grad_norm": 0.2481910139322281, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2854 + }, + { + "epoch": 2.2803514376996805, + "grad_norm": 0.06643390655517578, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2855 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.17466357350349426, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2856 + }, + { + "epoch": 2.2819488817891376, + "grad_norm": 0.27781131863594055, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2857 + }, + { + "epoch": 2.2827476038338657, + "grad_norm": 0.19475431740283966, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2858 + }, + { + "epoch": 2.283546325878594, + "grad_norm": 0.07700221985578537, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 2859 + }, + { + "epoch": 2.2843450479233227, + "grad_norm": 0.22520926594734192, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2860 + }, + { + "epoch": 2.2851437699680512, + "grad_norm": 0.18735183775424957, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2861 + }, + { + "epoch": 2.2859424920127793, + "grad_norm": 0.04133198782801628, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2862 + }, + { + "epoch": 2.286741214057508, + "grad_norm": 0.2526150941848755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2863 + }, + { + "epoch": 2.2875399361022364, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2864 + }, + { + "epoch": 2.288338658146965, + "grad_norm": 0.12839898467063904, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2865 + }, + { + "epoch": 2.2891373801916934, + "grad_norm": 0.1259411871433258, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2866 + }, + { + "epoch": 2.289936102236422, + "grad_norm": 0.25480905175209045, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2867 + }, + { + "epoch": 2.29073482428115, + "grad_norm": 0.15650653839111328, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2868 + }, + { + "epoch": 2.2915335463258786, + "grad_norm": 0.07474946230649948, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2869 + }, + { + "epoch": 2.292332268370607, + "grad_norm": 0.170192688703537, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2870 + }, + { + "epoch": 2.2931309904153356, + "grad_norm": 0.13292376697063446, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2871 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 0.045553866773843765, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2872 + }, + { + "epoch": 2.2947284345047922, + "grad_norm": 0.10853269696235657, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2873 + }, + { + "epoch": 2.2955271565495208, + "grad_norm": 0.09945288300514221, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2874 + }, + { + "epoch": 2.2963258785942493, + "grad_norm": 0.039073117077350616, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2875 + }, + { + "epoch": 2.297124600638978, + "grad_norm": 0.05867530405521393, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2876 + }, + { + "epoch": 2.297923322683706, + "grad_norm": 0.07227179408073425, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2877 + }, + { + "epoch": 2.2987220447284344, + "grad_norm": 0.04456201195716858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2878 + }, + { + "epoch": 2.299520766773163, + "grad_norm": 0.11672481894493103, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2879 + }, + { + "epoch": 2.3003194888178915, + "grad_norm": 0.12335679680109024, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2880 + }, + { + "epoch": 2.3011182108626196, + "grad_norm": 0.043409012258052826, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2881 + }, + { + "epoch": 2.301916932907348, + "grad_norm": 0.09896806627511978, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2882 + }, + { + "epoch": 2.3027156549520766, + "grad_norm": 0.2037963569164276, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2883 + }, + { + "epoch": 2.303514376996805, + "grad_norm": 0.21378903090953827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2884 + }, + { + "epoch": 2.3043130990415337, + "grad_norm": 0.062362927943468094, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2885 + }, + { + "epoch": 2.3051118210862622, + "grad_norm": 0.17370136082172394, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2886 + }, + { + "epoch": 2.3059105431309903, + "grad_norm": 0.23190435767173767, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2887 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.08148342370986938, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2888 + }, + { + "epoch": 2.3075079872204474, + "grad_norm": 0.1596807837486267, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2889 + }, + { + "epoch": 2.308306709265176, + "grad_norm": 0.26396819949150085, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2890 + }, + { + "epoch": 2.309105431309904, + "grad_norm": 0.1509561687707901, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2891 + }, + { + "epoch": 2.3099041533546325, + "grad_norm": 0.09147104620933533, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2892 + }, + { + "epoch": 2.310702875399361, + "grad_norm": 0.23575374484062195, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2893 + }, + { + "epoch": 2.3115015974440896, + "grad_norm": 0.18403767049312592, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2894 + }, + { + "epoch": 2.312300319488818, + "grad_norm": 0.052600763738155365, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2895 + }, + { + "epoch": 2.313099041533546, + "grad_norm": 0.18707415461540222, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2896 + }, + { + "epoch": 2.3138977635782747, + "grad_norm": 0.20824143290519714, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2897 + }, + { + "epoch": 2.3146964856230032, + "grad_norm": 0.0775759220123291, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2898 + }, + { + "epoch": 2.3154952076677318, + "grad_norm": 0.10904766619205475, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2899 + }, + { + "epoch": 2.31629392971246, + "grad_norm": 0.1562514752149582, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2900 + }, + { + "epoch": 2.3170926517571884, + "grad_norm": 0.06689859926700592, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2901 + }, + { + "epoch": 2.317891373801917, + "grad_norm": 0.0887206643819809, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2902 + }, + { + "epoch": 2.3186900958466454, + "grad_norm": 0.13615944981575012, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2903 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.08094146102666855, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2904 + }, + { + "epoch": 2.3202875399361025, + "grad_norm": 0.06734368950128555, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2905 + }, + { + "epoch": 2.3210862619808306, + "grad_norm": 0.17405667901039124, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2906 + }, + { + "epoch": 2.321884984025559, + "grad_norm": 0.23022079467773438, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2907 + }, + { + "epoch": 2.3226837060702876, + "grad_norm": 0.17341896891593933, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2908 + }, + { + "epoch": 2.323482428115016, + "grad_norm": 0.037751875817775726, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2909 + }, + { + "epoch": 2.3242811501597442, + "grad_norm": 0.12434598803520203, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2910 + }, + { + "epoch": 2.3250798722044728, + "grad_norm": 0.11344511806964874, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2911 + }, + { + "epoch": 2.3258785942492013, + "grad_norm": 0.05426390469074249, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2912 + }, + { + "epoch": 2.32667731629393, + "grad_norm": 0.11261611431837082, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2913 + }, + { + "epoch": 2.3274760383386583, + "grad_norm": 0.22023531794548035, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2914 + }, + { + "epoch": 2.3282747603833864, + "grad_norm": 0.2050291895866394, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2915 + }, + { + "epoch": 2.329073482428115, + "grad_norm": 0.05478905141353607, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2916 + }, + { + "epoch": 2.3298722044728435, + "grad_norm": 0.15363283455371857, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2917 + }, + { + "epoch": 2.330670926517572, + "grad_norm": 0.17348943650722504, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2918 + }, + { + "epoch": 2.3314696485623, + "grad_norm": 0.05366649851202965, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2919 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 0.16219462454319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2920 + }, + { + "epoch": 2.333067092651757, + "grad_norm": 0.23911446332931519, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2921 + }, + { + "epoch": 2.3338658146964857, + "grad_norm": 0.12384039163589478, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2922 + }, + { + "epoch": 2.334664536741214, + "grad_norm": 0.08747945725917816, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 2923 + }, + { + "epoch": 2.3354632587859427, + "grad_norm": 0.19737359881401062, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2924 + }, + { + "epoch": 2.336261980830671, + "grad_norm": 0.11312227696180344, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2925 + }, + { + "epoch": 2.3370607028753994, + "grad_norm": 0.09944877028465271, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2926 + }, + { + "epoch": 2.337859424920128, + "grad_norm": 0.23282872140407562, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2927 + }, + { + "epoch": 2.3386581469648564, + "grad_norm": 0.14369411766529083, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2928 + }, + { + "epoch": 2.3394568690095845, + "grad_norm": 0.07267388701438904, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2929 + }, + { + "epoch": 2.340255591054313, + "grad_norm": 0.18751965463161469, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2930 + }, + { + "epoch": 2.3410543130990416, + "grad_norm": 0.20886634290218353, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2931 + }, + { + "epoch": 2.34185303514377, + "grad_norm": 0.11675436794757843, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2932 + }, + { + "epoch": 2.3426517571884986, + "grad_norm": 0.08915580064058304, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2933 + }, + { + "epoch": 2.3434504792332267, + "grad_norm": 0.1534406840801239, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2934 + }, + { + "epoch": 2.344249201277955, + "grad_norm": 0.08791724592447281, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2935 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 0.04647858813405037, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2936 + }, + { + "epoch": 2.3458466453674123, + "grad_norm": 0.09236840158700943, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2937 + }, + { + "epoch": 2.3466453674121404, + "grad_norm": 0.09079006314277649, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2938 + }, + { + "epoch": 2.347444089456869, + "grad_norm": 0.03492455556988716, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2939 + }, + { + "epoch": 2.3482428115015974, + "grad_norm": 0.11871617287397385, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2940 + }, + { + "epoch": 2.349041533546326, + "grad_norm": 0.10904752463102341, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2941 + }, + { + "epoch": 2.3498402555910545, + "grad_norm": 0.05331781879067421, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2942 + }, + { + "epoch": 2.3506389776357826, + "grad_norm": 0.1213313564658165, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2943 + }, + { + "epoch": 2.351437699680511, + "grad_norm": 0.12995922565460205, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 2944 + }, + { + "epoch": 2.3522364217252396, + "grad_norm": 0.05770767107605934, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2945 + }, + { + "epoch": 2.353035143769968, + "grad_norm": 0.09310754388570786, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 2946 + }, + { + "epoch": 2.3538338658146967, + "grad_norm": 0.17539645731449127, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2947 + }, + { + "epoch": 2.3546325878594248, + "grad_norm": 0.14126333594322205, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2948 + }, + { + "epoch": 2.3554313099041533, + "grad_norm": 0.04220091179013252, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2949 + }, + { + "epoch": 2.356230031948882, + "grad_norm": 0.14341594278812408, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2950 + }, + { + "epoch": 2.3570287539936103, + "grad_norm": 0.13884525001049042, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2951 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 0.040859755128622055, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2952 + }, + { + "epoch": 2.358626198083067, + "grad_norm": 0.14475658535957336, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 2953 + }, + { + "epoch": 2.3594249201277955, + "grad_norm": 0.18962377309799194, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2954 + }, + { + "epoch": 2.360223642172524, + "grad_norm": 0.0909075066447258, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2955 + }, + { + "epoch": 2.3610223642172525, + "grad_norm": 0.08225106447935104, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2956 + }, + { + "epoch": 2.3618210862619806, + "grad_norm": 0.1564486026763916, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2957 + }, + { + "epoch": 2.362619808306709, + "grad_norm": 0.08859751373529434, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2958 + }, + { + "epoch": 2.3634185303514377, + "grad_norm": 0.10907880961894989, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2959 + }, + { + "epoch": 2.364217252396166, + "grad_norm": 0.2368745654821396, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2960 + }, + { + "epoch": 2.3650159744408947, + "grad_norm": 0.15427371859550476, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2961 + }, + { + "epoch": 2.365814696485623, + "grad_norm": 0.07661470025777817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2962 + }, + { + "epoch": 2.3666134185303513, + "grad_norm": 0.2368732988834381, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2963 + }, + { + "epoch": 2.36741214057508, + "grad_norm": 0.24830125272274017, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2964 + }, + { + "epoch": 2.3682108626198084, + "grad_norm": 0.06940490007400513, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2965 + }, + { + "epoch": 2.369009584664537, + "grad_norm": 0.18672171235084534, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 2966 + }, + { + "epoch": 2.369808306709265, + "grad_norm": 0.22521120309829712, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2967 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 0.0496690534055233, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2968 + }, + { + "epoch": 2.371405750798722, + "grad_norm": 0.16735650599002838, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2969 + }, + { + "epoch": 2.3722044728434506, + "grad_norm": 0.18583746254444122, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2970 + }, + { + "epoch": 2.373003194888179, + "grad_norm": 0.03828646242618561, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2971 + }, + { + "epoch": 2.373801916932907, + "grad_norm": 0.14302043616771698, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2972 + }, + { + "epoch": 2.3746006389776357, + "grad_norm": 0.14217248558998108, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2973 + }, + { + "epoch": 2.3753993610223643, + "grad_norm": 0.08656741678714752, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2974 + }, + { + "epoch": 2.376198083067093, + "grad_norm": 0.18724001944065094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2975 + }, + { + "epoch": 2.376996805111821, + "grad_norm": 0.21609556674957275, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2976 + }, + { + "epoch": 2.3777955271565494, + "grad_norm": 0.08098721504211426, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2977 + }, + { + "epoch": 2.378594249201278, + "grad_norm": 0.09842941910028458, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2978 + }, + { + "epoch": 2.3793929712460065, + "grad_norm": 0.14060764014720917, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2979 + }, + { + "epoch": 2.380191693290735, + "grad_norm": 0.063141830265522, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2980 + }, + { + "epoch": 2.380990415335463, + "grad_norm": 0.10411619395017624, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2981 + }, + { + "epoch": 2.3817891373801916, + "grad_norm": 0.15445855259895325, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2982 + }, + { + "epoch": 2.38258785942492, + "grad_norm": 0.07754000276327133, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2983 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 0.05312122777104378, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2984 + }, + { + "epoch": 2.384185303514377, + "grad_norm": 0.09916596859693527, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2985 + }, + { + "epoch": 2.3849840255591053, + "grad_norm": 0.12749150395393372, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2986 + }, + { + "epoch": 2.385782747603834, + "grad_norm": 0.054589178413152695, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2987 + }, + { + "epoch": 2.3865814696485623, + "grad_norm": 0.08480732887983322, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2988 + }, + { + "epoch": 2.387380191693291, + "grad_norm": 0.13158805668354034, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 2989 + }, + { + "epoch": 2.3881789137380194, + "grad_norm": 0.11916540563106537, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2990 + }, + { + "epoch": 2.3889776357827475, + "grad_norm": 0.05829031020402908, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2991 + }, + { + "epoch": 2.389776357827476, + "grad_norm": 0.18292354047298431, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2992 + }, + { + "epoch": 2.3905750798722045, + "grad_norm": 0.18494512140750885, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2993 + }, + { + "epoch": 2.391373801916933, + "grad_norm": 0.06371760368347168, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2994 + }, + { + "epoch": 2.392172523961661, + "grad_norm": 0.10157672315835953, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2995 + }, + { + "epoch": 2.3929712460063897, + "grad_norm": 0.13981172442436218, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2996 + }, + { + "epoch": 2.393769968051118, + "grad_norm": 0.07794835418462753, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2997 + }, + { + "epoch": 2.3945686900958467, + "grad_norm": 0.038293492048978806, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2998 + }, + { + "epoch": 2.3953674121405752, + "grad_norm": 0.06315408647060394, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2999 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 0.045907966792583466, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3000 + }, + { + "epoch": 2.396964856230032, + "grad_norm": 0.038717497140169144, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3001 + }, + { + "epoch": 2.3977635782747604, + "grad_norm": 0.0376095287501812, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3002 + }, + { + "epoch": 2.398562300319489, + "grad_norm": 0.05739009007811546, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3003 + }, + { + "epoch": 2.3993610223642174, + "grad_norm": 0.034832656383514404, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3004 + }, + { + "epoch": 2.4001597444089455, + "grad_norm": 0.06432276219129562, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3005 + }, + { + "epoch": 2.400958466453674, + "grad_norm": 0.05443817004561424, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3006 + }, + { + "epoch": 2.4017571884984026, + "grad_norm": 0.04691087454557419, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3007 + }, + { + "epoch": 2.402555910543131, + "grad_norm": 0.04394471272826195, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3008 + }, + { + "epoch": 2.4033546325878596, + "grad_norm": 0.03642019256949425, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3009 + }, + { + "epoch": 2.4041533546325877, + "grad_norm": 0.05891808122396469, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3010 + }, + { + "epoch": 2.4049520766773163, + "grad_norm": 0.04530616104602814, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3011 + }, + { + "epoch": 2.405750798722045, + "grad_norm": 0.0518258772790432, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3012 + }, + { + "epoch": 2.4065495207667733, + "grad_norm": 0.11279664188623428, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3013 + }, + { + "epoch": 2.4073482428115014, + "grad_norm": 0.10047753900289536, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3014 + }, + { + "epoch": 2.40814696485623, + "grad_norm": 0.06645897775888443, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3015 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 0.03372915834188461, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3016 + }, + { + "epoch": 2.409744408945687, + "grad_norm": 0.05353475734591484, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3017 + }, + { + "epoch": 2.4105431309904155, + "grad_norm": 0.038493942469358444, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3018 + }, + { + "epoch": 2.4113418530351436, + "grad_norm": 0.07303082197904587, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3019 + }, + { + "epoch": 2.412140575079872, + "grad_norm": 0.043219298124313354, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3020 + }, + { + "epoch": 2.4129392971246006, + "grad_norm": 0.05016458407044411, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3021 + }, + { + "epoch": 2.413738019169329, + "grad_norm": 0.08490880578756332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3022 + }, + { + "epoch": 2.4145367412140573, + "grad_norm": 0.07245411723852158, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3023 + }, + { + "epoch": 2.415335463258786, + "grad_norm": 0.052343063056468964, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3024 + }, + { + "epoch": 2.4161341853035143, + "grad_norm": 0.13449524343013763, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3025 + }, + { + "epoch": 2.416932907348243, + "grad_norm": 0.13177144527435303, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3026 + }, + { + "epoch": 2.4177316293929714, + "grad_norm": 0.06579594314098358, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 3027 + }, + { + "epoch": 2.4185303514377, + "grad_norm": 0.12716646492481232, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3028 + }, + { + "epoch": 2.419329073482428, + "grad_norm": 0.20006005465984344, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3029 + }, + { + "epoch": 2.4201277955271565, + "grad_norm": 0.16598355770111084, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3030 + }, + { + "epoch": 2.420926517571885, + "grad_norm": 0.06625109165906906, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3031 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.10521841049194336, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3032 + }, + { + "epoch": 2.4225239616613417, + "grad_norm": 0.14134426414966583, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3033 + }, + { + "epoch": 2.42332268370607, + "grad_norm": 0.056669678539037704, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3034 + }, + { + "epoch": 2.4241214057507987, + "grad_norm": 0.052738044410943985, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3035 + }, + { + "epoch": 2.4249201277955272, + "grad_norm": 0.06623729318380356, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3036 + }, + { + "epoch": 2.4257188498402558, + "grad_norm": 0.04038512706756592, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3037 + }, + { + "epoch": 2.426517571884984, + "grad_norm": 0.057600609958171844, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3038 + }, + { + "epoch": 2.4273162939297124, + "grad_norm": 0.08174199610948563, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3039 + }, + { + "epoch": 2.428115015974441, + "grad_norm": 0.07850457727909088, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3040 + }, + { + "epoch": 2.4289137380191694, + "grad_norm": 0.04368523135781288, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3041 + }, + { + "epoch": 2.4297124600638975, + "grad_norm": 0.11637478321790695, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3042 + }, + { + "epoch": 2.430511182108626, + "grad_norm": 0.09765078872442245, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3043 + }, + { + "epoch": 2.4313099041533546, + "grad_norm": 0.04842933267354965, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3044 + }, + { + "epoch": 2.432108626198083, + "grad_norm": 0.08858928829431534, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3045 + }, + { + "epoch": 2.4329073482428116, + "grad_norm": 0.12645326554775238, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3046 + }, + { + "epoch": 2.43370607028754, + "grad_norm": 0.09839878976345062, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 3047 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 0.04484904557466507, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3048 + }, + { + "epoch": 2.4353035143769968, + "grad_norm": 0.13912586867809296, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3049 + }, + { + "epoch": 2.4361022364217253, + "grad_norm": 0.18569444119930267, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3050 + }, + { + "epoch": 2.436900958466454, + "grad_norm": 0.13544169068336487, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3051 + }, + { + "epoch": 2.437699680511182, + "grad_norm": 0.04663483425974846, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3052 + }, + { + "epoch": 2.4384984025559104, + "grad_norm": 0.11609578132629395, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3053 + }, + { + "epoch": 2.439297124600639, + "grad_norm": 0.17497499287128448, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3054 + }, + { + "epoch": 2.4400958466453675, + "grad_norm": 0.19216352701187134, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3055 + }, + { + "epoch": 2.440894568690096, + "grad_norm": 0.11638841032981873, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3056 + }, + { + "epoch": 2.441693290734824, + "grad_norm": 0.05816149711608887, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3057 + }, + { + "epoch": 2.4424920127795526, + "grad_norm": 0.1650087982416153, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3058 + }, + { + "epoch": 2.443290734824281, + "grad_norm": 0.2105383425951004, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3059 + }, + { + "epoch": 2.4440894568690097, + "grad_norm": 0.133597731590271, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3060 + }, + { + "epoch": 2.4448881789137378, + "grad_norm": 0.03882076218724251, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3061 + }, + { + "epoch": 2.4456869009584663, + "grad_norm": 0.08914566785097122, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3062 + }, + { + "epoch": 2.446485623003195, + "grad_norm": 0.08115291595458984, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3063 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 0.0402134470641613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3064 + }, + { + "epoch": 2.448083067092652, + "grad_norm": 0.12838906049728394, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3065 + }, + { + "epoch": 2.4488817891373804, + "grad_norm": 0.1865018606185913, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3066 + }, + { + "epoch": 2.4496805111821085, + "grad_norm": 0.13134929537773132, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3067 + }, + { + "epoch": 2.450479233226837, + "grad_norm": 0.05415928363800049, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3068 + }, + { + "epoch": 2.4512779552715656, + "grad_norm": 0.0739838033914566, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3069 + }, + { + "epoch": 2.452076677316294, + "grad_norm": 0.07965957373380661, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 3070 + }, + { + "epoch": 2.452875399361022, + "grad_norm": 0.0416380800306797, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3071 + }, + { + "epoch": 2.4536741214057507, + "grad_norm": 0.03494519367814064, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3072 + }, + { + "epoch": 2.4544728434504792, + "grad_norm": 0.050772733986377716, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3073 + }, + { + "epoch": 2.4552715654952078, + "grad_norm": 0.03939373791217804, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3074 + }, + { + "epoch": 2.4560702875399363, + "grad_norm": 0.11769624799489975, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3075 + }, + { + "epoch": 2.4568690095846644, + "grad_norm": 0.33884114027023315, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3076 + }, + { + "epoch": 2.457667731629393, + "grad_norm": 0.07171089947223663, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3077 + }, + { + "epoch": 2.4584664536741214, + "grad_norm": 0.0707232877612114, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3078 + }, + { + "epoch": 2.45926517571885, + "grad_norm": 0.14245279133319855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3079 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.12356095761060715, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3080 + }, + { + "epoch": 2.4608626198083066, + "grad_norm": 0.0694037601351738, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3081 + }, + { + "epoch": 2.461661341853035, + "grad_norm": 0.0511220321059227, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3082 + }, + { + "epoch": 2.4624600638977636, + "grad_norm": 0.10915348678827286, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3083 + }, + { + "epoch": 2.463258785942492, + "grad_norm": 0.10797106474637985, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3084 + }, + { + "epoch": 2.4640575079872207, + "grad_norm": 0.05721200630068779, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3085 + }, + { + "epoch": 2.4648562300319488, + "grad_norm": 0.04477681592106819, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3086 + }, + { + "epoch": 2.4656549520766773, + "grad_norm": 0.08826448023319244, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3087 + }, + { + "epoch": 2.466453674121406, + "grad_norm": 0.1024692952632904, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3088 + }, + { + "epoch": 2.4672523961661343, + "grad_norm": 0.06543146073818207, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3089 + }, + { + "epoch": 2.4680511182108624, + "grad_norm": 0.06146182119846344, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3090 + }, + { + "epoch": 2.468849840255591, + "grad_norm": 0.12857408821582794, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3091 + }, + { + "epoch": 2.4696485623003195, + "grad_norm": 0.12273124605417252, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3092 + }, + { + "epoch": 2.470447284345048, + "grad_norm": 0.06467662751674652, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3093 + }, + { + "epoch": 2.4712460063897765, + "grad_norm": 0.07181179523468018, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3094 + }, + { + "epoch": 2.4720447284345046, + "grad_norm": 0.20223456621170044, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3095 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 0.25061357021331787, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3096 + }, + { + "epoch": 2.4736421725239617, + "grad_norm": 0.16317492723464966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3097 + }, + { + "epoch": 2.47444089456869, + "grad_norm": 0.04005994647741318, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3098 + }, + { + "epoch": 2.4752396166134183, + "grad_norm": 0.15954583883285522, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3099 + }, + { + "epoch": 2.476038338658147, + "grad_norm": 0.2088920623064041, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3100 + }, + { + "epoch": 2.4768370607028753, + "grad_norm": 0.11643055826425552, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3101 + }, + { + "epoch": 2.477635782747604, + "grad_norm": 0.11083687841892242, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3102 + }, + { + "epoch": 2.4784345047923324, + "grad_norm": 0.24777425825595856, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3103 + }, + { + "epoch": 2.479233226837061, + "grad_norm": 0.19513146579265594, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3104 + }, + { + "epoch": 2.480031948881789, + "grad_norm": 0.05009200796484947, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3105 + }, + { + "epoch": 2.4808306709265175, + "grad_norm": 0.2673046588897705, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3106 + }, + { + "epoch": 2.481629392971246, + "grad_norm": 0.3035629093647003, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3107 + }, + { + "epoch": 2.4824281150159746, + "grad_norm": 0.13213352859020233, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3108 + }, + { + "epoch": 2.4832268370607027, + "grad_norm": 0.13605083525180817, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3109 + }, + { + "epoch": 2.484025559105431, + "grad_norm": 0.2958623170852661, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3110 + }, + { + "epoch": 2.4848242811501597, + "grad_norm": 0.23080390691757202, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3111 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 0.046950701624155045, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3112 + }, + { + "epoch": 2.486421725239617, + "grad_norm": 0.24903765320777893, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3113 + }, + { + "epoch": 2.487220447284345, + "grad_norm": 0.233968585729599, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3114 + }, + { + "epoch": 2.4880191693290734, + "grad_norm": 0.04709520563483238, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3115 + }, + { + "epoch": 2.488817891373802, + "grad_norm": 0.16599629819393158, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3116 + }, + { + "epoch": 2.4896166134185305, + "grad_norm": 0.19273866713047028, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3117 + }, + { + "epoch": 2.4904153354632586, + "grad_norm": 0.11514598876237869, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3118 + }, + { + "epoch": 2.491214057507987, + "grad_norm": 0.08656881004571915, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 3119 + }, + { + "epoch": 2.4920127795527156, + "grad_norm": 0.18213899433612823, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3120 + }, + { + "epoch": 2.492811501597444, + "grad_norm": 0.11029175668954849, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3121 + }, + { + "epoch": 2.4936102236421727, + "grad_norm": 0.04480903223156929, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3122 + }, + { + "epoch": 2.494408945686901, + "grad_norm": 0.04919225722551346, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3123 + }, + { + "epoch": 2.4952076677316293, + "grad_norm": 0.06349056959152222, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3124 + }, + { + "epoch": 2.496006389776358, + "grad_norm": 0.04066464304924011, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3125 + }, + { + "epoch": 2.4968051118210863, + "grad_norm": 0.03992457687854767, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3126 + }, + { + "epoch": 2.497603833865815, + "grad_norm": 0.04580394923686981, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3127 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.13679265975952148, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3128 + }, + { + "epoch": 2.4992012779552715, + "grad_norm": 0.20708884298801422, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3129 + }, + { + "epoch": 2.5, + "grad_norm": 0.22991639375686646, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3130 + }, + { + "epoch": 2.5007987220447285, + "grad_norm": 0.15380895137786865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3131 + }, + { + "epoch": 2.501597444089457, + "grad_norm": 0.05112789571285248, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3132 + }, + { + "epoch": 2.502396166134185, + "grad_norm": 0.19797906279563904, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3133 + }, + { + "epoch": 2.5031948881789137, + "grad_norm": 0.18190141022205353, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3134 + }, + { + "epoch": 2.503993610223642, + "grad_norm": 0.04291468858718872, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3135 + }, + { + "epoch": 2.5047923322683707, + "grad_norm": 0.14576731622219086, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3136 + }, + { + "epoch": 2.505591054313099, + "grad_norm": 0.25093281269073486, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3137 + }, + { + "epoch": 2.5063897763578273, + "grad_norm": 0.22738556563854218, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3138 + }, + { + "epoch": 2.507188498402556, + "grad_norm": 0.08985915035009384, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3139 + }, + { + "epoch": 2.5079872204472844, + "grad_norm": 0.09632397443056107, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3140 + }, + { + "epoch": 2.508785942492013, + "grad_norm": 0.12138333916664124, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3141 + }, + { + "epoch": 2.5095846645367414, + "grad_norm": 0.04163306951522827, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3142 + }, + { + "epoch": 2.5103833865814695, + "grad_norm": 0.06187185272574425, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3143 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.09463546425104141, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3144 + }, + { + "epoch": 2.5119808306709266, + "grad_norm": 0.12386980652809143, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3145 + }, + { + "epoch": 2.512779552715655, + "grad_norm": 0.07090163975954056, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3146 + }, + { + "epoch": 2.513578274760383, + "grad_norm": 0.04502219334244728, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3147 + }, + { + "epoch": 2.5143769968051117, + "grad_norm": 0.08453603833913803, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3148 + }, + { + "epoch": 2.5151757188498403, + "grad_norm": 0.08686821907758713, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3149 + }, + { + "epoch": 2.515974440894569, + "grad_norm": 0.03968734294176102, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3150 + }, + { + "epoch": 2.5167731629392973, + "grad_norm": 0.08613990992307663, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3151 + }, + { + "epoch": 2.5175718849840254, + "grad_norm": 0.07950794696807861, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3152 + }, + { + "epoch": 2.518370607028754, + "grad_norm": 0.0449741929769516, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3153 + }, + { + "epoch": 2.5191693290734825, + "grad_norm": 0.09032034873962402, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3154 + }, + { + "epoch": 2.519968051118211, + "grad_norm": 0.06834430247545242, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3155 + }, + { + "epoch": 2.520766773162939, + "grad_norm": 0.13820379972457886, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3156 + }, + { + "epoch": 2.5215654952076676, + "grad_norm": 0.17753586173057556, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3157 + }, + { + "epoch": 2.522364217252396, + "grad_norm": 0.2663286626338959, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3158 + }, + { + "epoch": 2.5231629392971247, + "grad_norm": 0.21509577333927155, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3159 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.04614022746682167, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3160 + }, + { + "epoch": 2.5247603833865817, + "grad_norm": 0.13719527423381805, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3161 + }, + { + "epoch": 2.52555910543131, + "grad_norm": 0.20119087398052216, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3162 + }, + { + "epoch": 2.5263578274760383, + "grad_norm": 0.1822054237127304, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3163 + }, + { + "epoch": 2.527156549520767, + "grad_norm": 0.06550543755292892, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3164 + }, + { + "epoch": 2.527955271565495, + "grad_norm": 0.08079471439123154, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3165 + }, + { + "epoch": 2.5287539936102235, + "grad_norm": 0.10106988251209259, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3166 + }, + { + "epoch": 2.529552715654952, + "grad_norm": 0.06818784028291702, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3167 + }, + { + "epoch": 2.5303514376996805, + "grad_norm": 0.05976718291640282, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3168 + }, + { + "epoch": 2.531150159744409, + "grad_norm": 0.18163853883743286, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3169 + }, + { + "epoch": 2.5319488817891376, + "grad_norm": 0.26418858766555786, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3170 + }, + { + "epoch": 2.5327476038338657, + "grad_norm": 0.24044150114059448, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3171 + }, + { + "epoch": 2.533546325878594, + "grad_norm": 0.07499254494905472, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3172 + }, + { + "epoch": 2.5343450479233227, + "grad_norm": 0.17483314871788025, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3173 + }, + { + "epoch": 2.5351437699680512, + "grad_norm": 0.2698160707950592, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3174 + }, + { + "epoch": 2.5359424920127793, + "grad_norm": 0.2116270661354065, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3175 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.0545198880136013, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3176 + }, + { + "epoch": 2.5375399361022364, + "grad_norm": 0.1926649659872055, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3177 + }, + { + "epoch": 2.538338658146965, + "grad_norm": 0.24152790009975433, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3178 + }, + { + "epoch": 2.5391373801916934, + "grad_norm": 0.12380969524383545, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3179 + }, + { + "epoch": 2.539936102236422, + "grad_norm": 0.07934054732322693, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3180 + }, + { + "epoch": 2.54073482428115, + "grad_norm": 0.13688413798809052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3181 + }, + { + "epoch": 2.5415335463258786, + "grad_norm": 0.05832000821828842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3182 + }, + { + "epoch": 2.542332268370607, + "grad_norm": 0.08729993551969528, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3183 + }, + { + "epoch": 2.543130990415335, + "grad_norm": 0.16843630373477936, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3184 + }, + { + "epoch": 2.5439297124600637, + "grad_norm": 0.13045506179332733, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3185 + }, + { + "epoch": 2.5447284345047922, + "grad_norm": 0.038882140070199966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3186 + }, + { + "epoch": 2.5455271565495208, + "grad_norm": 0.14922545850276947, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3187 + }, + { + "epoch": 2.5463258785942493, + "grad_norm": 0.1961440145969391, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3188 + }, + { + "epoch": 2.547124600638978, + "grad_norm": 0.08585302531719208, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3189 + }, + { + "epoch": 2.547923322683706, + "grad_norm": 0.13141697645187378, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3190 + }, + { + "epoch": 2.5487220447284344, + "grad_norm": 0.20332233607769012, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3191 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.1740144044160843, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3192 + }, + { + "epoch": 2.5503194888178915, + "grad_norm": 0.04738207906484604, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3193 + }, + { + "epoch": 2.5511182108626196, + "grad_norm": 0.23204317688941956, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3194 + }, + { + "epoch": 2.551916932907348, + "grad_norm": 0.29033714532852173, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3195 + }, + { + "epoch": 2.5527156549520766, + "grad_norm": 0.1251334547996521, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3196 + }, + { + "epoch": 2.553514376996805, + "grad_norm": 0.1610727608203888, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3197 + }, + { + "epoch": 2.5543130990415337, + "grad_norm": 0.284105509519577, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3198 + }, + { + "epoch": 2.5551118210862622, + "grad_norm": 0.1530643254518509, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 3199 + }, + { + "epoch": 2.5559105431309903, + "grad_norm": 0.07761498540639877, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3200 + }, + { + "epoch": 2.556709265175719, + "grad_norm": 0.16693277657032013, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3201 + }, + { + "epoch": 2.5575079872204474, + "grad_norm": 0.06345608085393906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3202 + }, + { + "epoch": 2.5583067092651754, + "grad_norm": 0.10956210643053055, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3203 + }, + { + "epoch": 2.559105431309904, + "grad_norm": 0.17655007541179657, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3204 + }, + { + "epoch": 2.5599041533546325, + "grad_norm": 0.12615050375461578, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3205 + }, + { + "epoch": 2.560702875399361, + "grad_norm": 0.049671441316604614, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3206 + }, + { + "epoch": 2.5615015974440896, + "grad_norm": 0.16559815406799316, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3207 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 0.1279190182685852, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3208 + }, + { + "epoch": 2.563099041533546, + "grad_norm": 0.0540652722120285, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3209 + }, + { + "epoch": 2.5638977635782747, + "grad_norm": 0.1287074238061905, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 3210 + }, + { + "epoch": 2.5646964856230032, + "grad_norm": 0.1118067055940628, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3211 + }, + { + "epoch": 2.5654952076677318, + "grad_norm": 0.05159451439976692, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3212 + }, + { + "epoch": 2.56629392971246, + "grad_norm": 0.10654652118682861, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3213 + }, + { + "epoch": 2.5670926517571884, + "grad_norm": 0.15669982135295868, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3214 + }, + { + "epoch": 2.567891373801917, + "grad_norm": 0.11388157308101654, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3215 + }, + { + "epoch": 2.5686900958466454, + "grad_norm": 0.06434119492769241, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3216 + }, + { + "epoch": 2.569488817891374, + "grad_norm": 0.050070468336343765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3217 + }, + { + "epoch": 2.5702875399361025, + "grad_norm": 0.0522335022687912, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3218 + }, + { + "epoch": 2.5710862619808306, + "grad_norm": 0.04716494306921959, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3219 + }, + { + "epoch": 2.571884984025559, + "grad_norm": 0.03770711272954941, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3220 + }, + { + "epoch": 2.5726837060702876, + "grad_norm": 0.03955485299229622, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3221 + }, + { + "epoch": 2.5734824281150157, + "grad_norm": 0.03824841231107712, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3222 + }, + { + "epoch": 2.5742811501597442, + "grad_norm": 0.04722970351576805, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3223 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 0.05470758676528931, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3224 + }, + { + "epoch": 2.5758785942492013, + "grad_norm": 0.04934269189834595, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3225 + }, + { + "epoch": 2.57667731629393, + "grad_norm": 0.040627289563417435, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3226 + }, + { + "epoch": 2.5774760383386583, + "grad_norm": 0.05668056383728981, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3227 + }, + { + "epoch": 2.5782747603833864, + "grad_norm": 0.11724753677845001, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3228 + }, + { + "epoch": 2.579073482428115, + "grad_norm": 0.12204517424106598, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3229 + }, + { + "epoch": 2.5798722044728435, + "grad_norm": 0.10652083158493042, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3230 + }, + { + "epoch": 2.580670926517572, + "grad_norm": 0.07430299371480942, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3231 + }, + { + "epoch": 2.5814696485623, + "grad_norm": 0.03460770472884178, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3232 + }, + { + "epoch": 2.5822683706070286, + "grad_norm": 0.080150306224823, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3233 + }, + { + "epoch": 2.583067092651757, + "grad_norm": 0.1291198879480362, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3234 + }, + { + "epoch": 2.5838658146964857, + "grad_norm": 0.19541533291339874, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3235 + }, + { + "epoch": 2.584664536741214, + "grad_norm": 0.24089939892292023, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3236 + }, + { + "epoch": 2.5854632587859427, + "grad_norm": 0.1933099627494812, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3237 + }, + { + "epoch": 2.586261980830671, + "grad_norm": 0.07295489311218262, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3238 + }, + { + "epoch": 2.5870607028753994, + "grad_norm": 0.10686071962118149, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3239 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 0.17052637040615082, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3240 + }, + { + "epoch": 2.588658146964856, + "grad_norm": 0.12377535551786423, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3241 + }, + { + "epoch": 2.5894568690095845, + "grad_norm": 0.03730800375342369, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3242 + }, + { + "epoch": 2.590255591054313, + "grad_norm": 0.13848428428173065, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3243 + }, + { + "epoch": 2.5910543130990416, + "grad_norm": 0.18361017107963562, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3244 + }, + { + "epoch": 2.59185303514377, + "grad_norm": 0.11140795797109604, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3245 + }, + { + "epoch": 2.5926517571884986, + "grad_norm": 0.033891428261995316, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3246 + }, + { + "epoch": 2.5934504792332267, + "grad_norm": 0.13179628551006317, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3247 + }, + { + "epoch": 2.594249201277955, + "grad_norm": 0.19785374402999878, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3248 + }, + { + "epoch": 2.5950479233226837, + "grad_norm": 0.15991398692131042, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3249 + }, + { + "epoch": 2.5958466453674123, + "grad_norm": 0.0702645480632782, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3250 + }, + { + "epoch": 2.5966453674121404, + "grad_norm": 0.038220152258872986, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3251 + }, + { + "epoch": 2.597444089456869, + "grad_norm": 0.048042308539152145, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3252 + }, + { + "epoch": 2.5982428115015974, + "grad_norm": 0.05673132464289665, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3253 + }, + { + "epoch": 2.599041533546326, + "grad_norm": 0.057284750044345856, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3254 + }, + { + "epoch": 2.5998402555910545, + "grad_norm": 0.052904874086380005, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3255 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.04914860427379608, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3256 + }, + { + "epoch": 2.601437699680511, + "grad_norm": 0.08870472013950348, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3257 + }, + { + "epoch": 2.6022364217252396, + "grad_norm": 0.09863728284835815, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3258 + }, + { + "epoch": 2.603035143769968, + "grad_norm": 0.08116353303194046, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3259 + }, + { + "epoch": 2.6038338658146962, + "grad_norm": 0.043653007596731186, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3260 + }, + { + "epoch": 2.6046325878594248, + "grad_norm": 0.0579618401825428, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3261 + }, + { + "epoch": 2.6054313099041533, + "grad_norm": 0.08072935789823532, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3262 + }, + { + "epoch": 2.606230031948882, + "grad_norm": 0.05391686409711838, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3263 + }, + { + "epoch": 2.6070287539936103, + "grad_norm": 0.03471128270030022, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3264 + }, + { + "epoch": 2.607827476038339, + "grad_norm": 0.056328870356082916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3265 + }, + { + "epoch": 2.608626198083067, + "grad_norm": 0.05196002125740051, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3266 + }, + { + "epoch": 2.6094249201277955, + "grad_norm": 0.04338999465107918, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3267 + }, + { + "epoch": 2.610223642172524, + "grad_norm": 0.12365762889385223, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3268 + }, + { + "epoch": 2.6110223642172525, + "grad_norm": 0.19469699263572693, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3269 + }, + { + "epoch": 2.6118210862619806, + "grad_norm": 0.1825639009475708, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3270 + }, + { + "epoch": 2.612619808306709, + "grad_norm": 0.10235249251127243, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3271 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 0.05571124702692032, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3272 + }, + { + "epoch": 2.614217252396166, + "grad_norm": 0.1536952704191208, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3273 + }, + { + "epoch": 2.6150159744408947, + "grad_norm": 0.163212850689888, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3274 + }, + { + "epoch": 2.6158146964856233, + "grad_norm": 0.09640593826770782, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3275 + }, + { + "epoch": 2.6166134185303513, + "grad_norm": 0.04329126700758934, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3276 + }, + { + "epoch": 2.61741214057508, + "grad_norm": 0.03598733991384506, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3277 + }, + { + "epoch": 2.6182108626198084, + "grad_norm": 0.046664439141750336, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3278 + }, + { + "epoch": 2.6190095846645365, + "grad_norm": 0.03692904859781265, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3279 + }, + { + "epoch": 2.619808306709265, + "grad_norm": 0.0482964888215065, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3280 + }, + { + "epoch": 2.6206070287539935, + "grad_norm": 0.07996834069490433, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3281 + }, + { + "epoch": 2.621405750798722, + "grad_norm": 0.060141101479530334, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3282 + }, + { + "epoch": 2.6222044728434506, + "grad_norm": 0.04013051837682724, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 3283 + }, + { + "epoch": 2.623003194888179, + "grad_norm": 0.04011296480894089, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3284 + }, + { + "epoch": 2.623801916932907, + "grad_norm": 0.04112064838409424, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3285 + }, + { + "epoch": 2.6246006389776357, + "grad_norm": 0.057281915098428726, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3286 + }, + { + "epoch": 2.6253993610223643, + "grad_norm": 0.06061771139502525, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3287 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 0.05844549089670181, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3288 + }, + { + "epoch": 2.626996805111821, + "grad_norm": 0.06354600191116333, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3289 + }, + { + "epoch": 2.6277955271565494, + "grad_norm": 0.04568248987197876, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3290 + }, + { + "epoch": 2.628594249201278, + "grad_norm": 0.04340318217873573, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3291 + }, + { + "epoch": 2.6293929712460065, + "grad_norm": 0.07078617066144943, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3292 + }, + { + "epoch": 2.630191693290735, + "grad_norm": 0.09865503013134003, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3293 + }, + { + "epoch": 2.6309904153354635, + "grad_norm": 0.08623871207237244, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3294 + }, + { + "epoch": 2.6317891373801916, + "grad_norm": 0.03787717968225479, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3295 + }, + { + "epoch": 2.63258785942492, + "grad_norm": 0.14653000235557556, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3296 + }, + { + "epoch": 2.6333865814696487, + "grad_norm": 0.2749452292919159, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3297 + }, + { + "epoch": 2.6341853035143767, + "grad_norm": 0.28424543142318726, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3298 + }, + { + "epoch": 2.6349840255591053, + "grad_norm": 0.17354224622249603, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3299 + }, + { + "epoch": 2.635782747603834, + "grad_norm": 0.04208464175462723, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3300 + }, + { + "epoch": 2.6365814696485623, + "grad_norm": 0.15522420406341553, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3301 + }, + { + "epoch": 2.637380191693291, + "grad_norm": 0.17986370623111725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3302 + }, + { + "epoch": 2.6381789137380194, + "grad_norm": 0.07155515998601913, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3303 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.11287503689527512, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3304 + }, + { + "epoch": 2.639776357827476, + "grad_norm": 0.22735139727592468, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3305 + }, + { + "epoch": 2.6405750798722045, + "grad_norm": 0.23528814315795898, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3306 + }, + { + "epoch": 2.641373801916933, + "grad_norm": 0.13828198611736298, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3307 + }, + { + "epoch": 2.642172523961661, + "grad_norm": 0.046783462166786194, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3308 + }, + { + "epoch": 2.6429712460063897, + "grad_norm": 0.13010001182556152, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3309 + }, + { + "epoch": 2.643769968051118, + "grad_norm": 0.12339942902326584, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3310 + }, + { + "epoch": 2.6445686900958467, + "grad_norm": 0.06443019211292267, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3311 + }, + { + "epoch": 2.6453674121405752, + "grad_norm": 0.05086766183376312, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3312 + }, + { + "epoch": 2.6461661341853038, + "grad_norm": 0.1266956627368927, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3313 + }, + { + "epoch": 2.646964856230032, + "grad_norm": 0.1238899901509285, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3314 + }, + { + "epoch": 2.6477635782747604, + "grad_norm": 0.07378736138343811, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3315 + }, + { + "epoch": 2.648562300319489, + "grad_norm": 0.12572194635868073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3316 + }, + { + "epoch": 2.649361022364217, + "grad_norm": 0.18099260330200195, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3317 + }, + { + "epoch": 2.6501597444089455, + "grad_norm": 0.1383541077375412, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3318 + }, + { + "epoch": 2.650958466453674, + "grad_norm": 0.043900374323129654, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3319 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 0.13228318095207214, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3320 + }, + { + "epoch": 2.652555910543131, + "grad_norm": 0.11684399843215942, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3321 + }, + { + "epoch": 2.6533546325878596, + "grad_norm": 0.03879965469241142, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3322 + }, + { + "epoch": 2.6541533546325877, + "grad_norm": 0.1457953006029129, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3323 + }, + { + "epoch": 2.6549520766773163, + "grad_norm": 0.21643802523612976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3324 + }, + { + "epoch": 2.655750798722045, + "grad_norm": 0.20250067114830017, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3325 + }, + { + "epoch": 2.6565495207667733, + "grad_norm": 0.09131773561239243, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3326 + }, + { + "epoch": 2.6573482428115014, + "grad_norm": 0.07217761129140854, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3327 + }, + { + "epoch": 2.65814696485623, + "grad_norm": 0.13251517713069916, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3328 + }, + { + "epoch": 2.6589456869009584, + "grad_norm": 0.09462655335664749, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3329 + }, + { + "epoch": 2.659744408945687, + "grad_norm": 0.04496161639690399, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3330 + }, + { + "epoch": 2.6605431309904155, + "grad_norm": 0.13246162235736847, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3331 + }, + { + "epoch": 2.661341853035144, + "grad_norm": 0.1548391878604889, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3332 + }, + { + "epoch": 2.662140575079872, + "grad_norm": 0.09438800066709518, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3333 + }, + { + "epoch": 2.6629392971246006, + "grad_norm": 0.033411599695682526, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3334 + }, + { + "epoch": 2.663738019169329, + "grad_norm": 0.04015564173460007, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3335 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 0.033046361058950424, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3336 + }, + { + "epoch": 2.665335463258786, + "grad_norm": 0.04766019433736801, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3337 + }, + { + "epoch": 2.6661341853035143, + "grad_norm": 0.06365641951560974, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3338 + }, + { + "epoch": 2.666932907348243, + "grad_norm": 0.03329809010028839, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3339 + }, + { + "epoch": 2.6677316293929714, + "grad_norm": 0.10063061863183975, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3340 + }, + { + "epoch": 2.6685303514377, + "grad_norm": 0.16541579365730286, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 3341 + }, + { + "epoch": 2.669329073482428, + "grad_norm": 0.18877379596233368, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3342 + }, + { + "epoch": 2.6701277955271565, + "grad_norm": 0.12577234208583832, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3343 + }, + { + "epoch": 2.670926517571885, + "grad_norm": 0.04403039440512657, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3344 + }, + { + "epoch": 2.6717252396166136, + "grad_norm": 0.172403946518898, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3345 + }, + { + "epoch": 2.6725239616613417, + "grad_norm": 0.2147791087627411, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3346 + }, + { + "epoch": 2.67332268370607, + "grad_norm": 0.1536005735397339, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3347 + }, + { + "epoch": 2.6741214057507987, + "grad_norm": 0.061038631945848465, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3348 + }, + { + "epoch": 2.6749201277955272, + "grad_norm": 0.03402748703956604, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3349 + }, + { + "epoch": 2.6757188498402558, + "grad_norm": 0.05285736918449402, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3350 + }, + { + "epoch": 2.6765175718849843, + "grad_norm": 0.0807662233710289, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3351 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.057097889482975006, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3352 + }, + { + "epoch": 2.678115015974441, + "grad_norm": 0.06845760345458984, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3353 + }, + { + "epoch": 2.6789137380191694, + "grad_norm": 0.1209796816110611, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3354 + }, + { + "epoch": 2.6797124600638975, + "grad_norm": 0.09372428804636002, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3355 + }, + { + "epoch": 2.680511182108626, + "grad_norm": 0.03795485943555832, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3356 + }, + { + "epoch": 2.6813099041533546, + "grad_norm": 0.14420334994792938, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3357 + }, + { + "epoch": 2.682108626198083, + "grad_norm": 0.23049019277095795, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3358 + }, + { + "epoch": 2.6829073482428116, + "grad_norm": 0.21722057461738586, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3359 + }, + { + "epoch": 2.68370607028754, + "grad_norm": 0.0968366488814354, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3360 + }, + { + "epoch": 2.6845047923322682, + "grad_norm": 0.10279416292905807, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3361 + }, + { + "epoch": 2.6853035143769968, + "grad_norm": 0.2077404409646988, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3362 + }, + { + "epoch": 2.6861022364217253, + "grad_norm": 0.14186711609363556, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3363 + }, + { + "epoch": 2.686900958466454, + "grad_norm": 0.04573604837059975, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3364 + }, + { + "epoch": 2.687699680511182, + "grad_norm": 0.13861627876758575, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3365 + }, + { + "epoch": 2.6884984025559104, + "grad_norm": 0.17746120691299438, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3366 + }, + { + "epoch": 2.689297124600639, + "grad_norm": 0.15865683555603027, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3367 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.05537402629852295, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3368 + }, + { + "epoch": 2.690894568690096, + "grad_norm": 0.064423106610775, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3369 + }, + { + "epoch": 2.6916932907348246, + "grad_norm": 0.0922585278749466, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3370 + }, + { + "epoch": 2.6924920127795526, + "grad_norm": 0.08034171909093857, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3371 + }, + { + "epoch": 2.693290734824281, + "grad_norm": 0.05695292726159096, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3372 + }, + { + "epoch": 2.6940894568690097, + "grad_norm": 0.04140406847000122, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3373 + }, + { + "epoch": 2.6948881789137378, + "grad_norm": 0.038130711764097214, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3374 + }, + { + "epoch": 2.6956869009584663, + "grad_norm": 0.07363594323396683, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3375 + }, + { + "epoch": 2.696485623003195, + "grad_norm": 0.13670513033866882, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3376 + }, + { + "epoch": 2.6972843450479234, + "grad_norm": 0.16614536941051483, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3377 + }, + { + "epoch": 2.698083067092652, + "grad_norm": 0.1346762478351593, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3378 + }, + { + "epoch": 2.6988817891373804, + "grad_norm": 0.06321856379508972, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3379 + }, + { + "epoch": 2.6996805111821085, + "grad_norm": 0.057517897337675095, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3380 + }, + { + "epoch": 2.700479233226837, + "grad_norm": 0.11995001137256622, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3381 + }, + { + "epoch": 2.7012779552715656, + "grad_norm": 0.10514877736568451, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3382 + }, + { + "epoch": 2.702076677316294, + "grad_norm": 0.05942686274647713, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3383 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 0.03508206829428673, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3384 + }, + { + "epoch": 2.7036741214057507, + "grad_norm": 0.05182692036032677, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3385 + }, + { + "epoch": 2.7044728434504792, + "grad_norm": 0.0597345344722271, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3386 + }, + { + "epoch": 2.7052715654952078, + "grad_norm": 0.037486087530851364, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3387 + }, + { + "epoch": 2.7060702875399363, + "grad_norm": 0.040483538061380386, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 3388 + }, + { + "epoch": 2.706869009584665, + "grad_norm": 0.044094670563936234, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3389 + }, + { + "epoch": 2.707667731629393, + "grad_norm": 0.06498228758573532, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3390 + }, + { + "epoch": 2.7084664536741214, + "grad_norm": 0.06955298781394958, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3391 + }, + { + "epoch": 2.70926517571885, + "grad_norm": 0.11691966652870178, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3392 + }, + { + "epoch": 2.710063897763578, + "grad_norm": 0.1183234304189682, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3393 + }, + { + "epoch": 2.7108626198083066, + "grad_norm": 0.08358792215585709, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3394 + }, + { + "epoch": 2.711661341853035, + "grad_norm": 0.04190056398510933, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3395 + }, + { + "epoch": 2.7124600638977636, + "grad_norm": 0.09757649153470993, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3396 + }, + { + "epoch": 2.713258785942492, + "grad_norm": 0.11508934944868088, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 3397 + }, + { + "epoch": 2.7140575079872207, + "grad_norm": 0.05612087994813919, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3398 + }, + { + "epoch": 2.7148562300319488, + "grad_norm": 0.07044408470392227, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3399 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.07732822746038437, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3400 + }, + { + "epoch": 2.716453674121406, + "grad_norm": 0.054326847195625305, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3401 + }, + { + "epoch": 2.7172523961661343, + "grad_norm": 0.041327398270368576, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3402 + }, + { + "epoch": 2.7180511182108624, + "grad_norm": 0.07147548347711563, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3403 + }, + { + "epoch": 2.718849840255591, + "grad_norm": 0.12999942898750305, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3404 + }, + { + "epoch": 2.7196485623003195, + "grad_norm": 0.18404515087604523, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3405 + }, + { + "epoch": 2.720447284345048, + "grad_norm": 0.1873377114534378, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3406 + }, + { + "epoch": 2.7212460063897765, + "grad_norm": 0.0732024610042572, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3407 + }, + { + "epoch": 2.722044728434505, + "grad_norm": 0.07602795958518982, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3408 + }, + { + "epoch": 2.722843450479233, + "grad_norm": 0.07871323823928833, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3409 + }, + { + "epoch": 2.7236421725239617, + "grad_norm": 0.0738302692770958, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3410 + }, + { + "epoch": 2.72444089456869, + "grad_norm": 0.12097286432981491, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3411 + }, + { + "epoch": 2.7252396166134183, + "grad_norm": 0.10136821120977402, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3412 + }, + { + "epoch": 2.726038338658147, + "grad_norm": 0.07281512022018433, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3413 + }, + { + "epoch": 2.7268370607028753, + "grad_norm": 0.09425969421863556, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3414 + }, + { + "epoch": 2.727635782747604, + "grad_norm": 0.11939436942338943, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3415 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 0.07181181758642197, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3416 + }, + { + "epoch": 2.729233226837061, + "grad_norm": 0.06634730845689774, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3417 + }, + { + "epoch": 2.730031948881789, + "grad_norm": 0.0941692590713501, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3418 + }, + { + "epoch": 2.7308306709265175, + "grad_norm": 0.10803452879190445, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3419 + }, + { + "epoch": 2.731629392971246, + "grad_norm": 0.08289305865764618, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3420 + }, + { + "epoch": 2.7324281150159746, + "grad_norm": 0.048421960324048996, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3421 + }, + { + "epoch": 2.7332268370607027, + "grad_norm": 0.09108635783195496, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3422 + }, + { + "epoch": 2.734025559105431, + "grad_norm": 0.13627508282661438, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3423 + }, + { + "epoch": 2.7348242811501597, + "grad_norm": 0.14651858806610107, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3424 + }, + { + "epoch": 2.7356230031948883, + "grad_norm": 0.126741424202919, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3425 + }, + { + "epoch": 2.736421725239617, + "grad_norm": 0.05885545164346695, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3426 + }, + { + "epoch": 2.737220447284345, + "grad_norm": 0.09471739828586578, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3427 + }, + { + "epoch": 2.7380191693290734, + "grad_norm": 0.18026123940944672, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3428 + }, + { + "epoch": 2.738817891373802, + "grad_norm": 0.1737871915102005, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3429 + }, + { + "epoch": 2.7396166134185305, + "grad_norm": 0.052994512021541595, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3430 + }, + { + "epoch": 2.7404153354632586, + "grad_norm": 0.13484452664852142, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 3431 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 0.2207227200269699, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3432 + }, + { + "epoch": 2.7420127795527156, + "grad_norm": 0.17741963267326355, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3433 + }, + { + "epoch": 2.742811501597444, + "grad_norm": 0.07451824843883514, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3434 + }, + { + "epoch": 2.7436102236421727, + "grad_norm": 0.07947403192520142, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3435 + }, + { + "epoch": 2.744408945686901, + "grad_norm": 0.11197762936353683, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3436 + }, + { + "epoch": 2.7452076677316293, + "grad_norm": 0.08398377895355225, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 3437 + }, + { + "epoch": 2.746006389776358, + "grad_norm": 0.03809420019388199, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3438 + }, + { + "epoch": 2.7468051118210863, + "grad_norm": 0.11537694931030273, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3439 + }, + { + "epoch": 2.747603833865815, + "grad_norm": 0.1537221372127533, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3440 + }, + { + "epoch": 2.748402555910543, + "grad_norm": 0.1132403165102005, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3441 + }, + { + "epoch": 2.7492012779552715, + "grad_norm": 0.038440920412540436, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3442 + }, + { + "epoch": 2.75, + "grad_norm": 0.10132595151662827, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3443 + }, + { + "epoch": 2.7507987220447285, + "grad_norm": 0.12446253001689911, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3444 + }, + { + "epoch": 2.751597444089457, + "grad_norm": 0.05364474281668663, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3445 + }, + { + "epoch": 2.752396166134185, + "grad_norm": 0.04705234244465828, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3446 + }, + { + "epoch": 2.7531948881789137, + "grad_norm": 0.10524975508451462, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3447 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 0.12036000937223434, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3448 + }, + { + "epoch": 2.7547923322683707, + "grad_norm": 0.08042819797992706, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3449 + }, + { + "epoch": 2.755591054313099, + "grad_norm": 0.04404102638363838, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3450 + }, + { + "epoch": 2.7563897763578273, + "grad_norm": 0.0766257792711258, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3451 + }, + { + "epoch": 2.757188498402556, + "grad_norm": 0.06359248608350754, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3452 + }, + { + "epoch": 2.7579872204472844, + "grad_norm": 0.06752901524305344, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3453 + }, + { + "epoch": 2.758785942492013, + "grad_norm": 0.12018375843763351, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3454 + }, + { + "epoch": 2.7595846645367414, + "grad_norm": 0.15904727578163147, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3455 + }, + { + "epoch": 2.7603833865814695, + "grad_norm": 0.12665021419525146, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3456 + }, + { + "epoch": 2.761182108626198, + "grad_norm": 0.07552342861890793, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3457 + }, + { + "epoch": 2.7619808306709266, + "grad_norm": 0.25927653908729553, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3458 + }, + { + "epoch": 2.762779552715655, + "grad_norm": 0.3487590253353119, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3459 + }, + { + "epoch": 2.763578274760383, + "grad_norm": 0.2783665359020233, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3460 + }, + { + "epoch": 2.7643769968051117, + "grad_norm": 0.054424334317445755, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3461 + }, + { + "epoch": 2.7651757188498403, + "grad_norm": 0.240921288728714, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3462 + }, + { + "epoch": 2.765974440894569, + "grad_norm": 0.3380962014198303, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3463 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.1514623463153839, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3464 + }, + { + "epoch": 2.7675718849840254, + "grad_norm": 0.15135464072227478, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3465 + }, + { + "epoch": 2.768370607028754, + "grad_norm": 0.262546181678772, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3466 + }, + { + "epoch": 2.7691693290734825, + "grad_norm": 0.11052273958921432, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3467 + }, + { + "epoch": 2.769968051118211, + "grad_norm": 0.14473804831504822, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3468 + }, + { + "epoch": 2.770766773162939, + "grad_norm": 0.24968142807483673, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3469 + }, + { + "epoch": 2.7715654952076676, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3470 + }, + { + "epoch": 2.772364217252396, + "grad_norm": 0.0957072302699089, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3471 + }, + { + "epoch": 2.7731629392971247, + "grad_norm": 0.2122000902891159, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3472 + }, + { + "epoch": 2.773961661341853, + "grad_norm": 0.15716226398944855, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3473 + }, + { + "epoch": 2.7747603833865817, + "grad_norm": 0.05107169970870018, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3474 + }, + { + "epoch": 2.77555910543131, + "grad_norm": 0.19824674725532532, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3475 + }, + { + "epoch": 2.7763578274760383, + "grad_norm": 0.16866235435009003, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3476 + }, + { + "epoch": 2.777156549520767, + "grad_norm": 0.03332412987947464, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3477 + }, + { + "epoch": 2.777955271565495, + "grad_norm": 0.1771237850189209, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3478 + }, + { + "epoch": 2.7787539936102235, + "grad_norm": 0.23501509428024292, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3479 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.0976579561829567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3480 + }, + { + "epoch": 2.7803514376996805, + "grad_norm": 0.11640458554029465, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3481 + }, + { + "epoch": 2.781150159744409, + "grad_norm": 0.2140960842370987, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3482 + }, + { + "epoch": 2.7819488817891376, + "grad_norm": 0.2055736929178238, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3483 + }, + { + "epoch": 2.7827476038338657, + "grad_norm": 0.09386937320232391, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 3484 + }, + { + "epoch": 2.783546325878594, + "grad_norm": 0.11534380912780762, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3485 + }, + { + "epoch": 2.7843450479233227, + "grad_norm": 0.19186711311340332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3486 + }, + { + "epoch": 2.7851437699680512, + "grad_norm": 0.26858124136924744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3487 + }, + { + "epoch": 2.7859424920127793, + "grad_norm": 0.05965370684862137, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 3488 + }, + { + "epoch": 2.786741214057508, + "grad_norm": 0.17804528772830963, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3489 + }, + { + "epoch": 2.7875399361022364, + "grad_norm": 0.1802065223455429, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 3490 + }, + { + "epoch": 2.788338658146965, + "grad_norm": 0.06634502857923508, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3491 + }, + { + "epoch": 2.7891373801916934, + "grad_norm": 0.06682102382183075, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3492 + }, + { + "epoch": 2.789936102236422, + "grad_norm": 0.08941584080457687, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3493 + }, + { + "epoch": 2.79073482428115, + "grad_norm": 0.06336037069559097, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3494 + }, + { + "epoch": 2.7915335463258786, + "grad_norm": 0.05562690272927284, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3495 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.10294149816036224, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3496 + }, + { + "epoch": 2.793130990415335, + "grad_norm": 0.11363442987203598, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3497 + }, + { + "epoch": 2.7939297124600637, + "grad_norm": 0.05790446698665619, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3498 + }, + { + "epoch": 2.7947284345047922, + "grad_norm": 0.09351370483636856, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3499 + }, + { + "epoch": 2.7955271565495208, + "grad_norm": 0.2225412130355835, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3500 + }, + { + "epoch": 2.7963258785942493, + "grad_norm": 0.21828165650367737, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3501 + }, + { + "epoch": 2.797124600638978, + "grad_norm": 0.06987733393907547, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 3502 + }, + { + "epoch": 2.797923322683706, + "grad_norm": 0.14518103003501892, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3503 + }, + { + "epoch": 2.7987220447284344, + "grad_norm": 0.24233761429786682, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3504 + }, + { + "epoch": 2.799520766773163, + "grad_norm": 0.19286365807056427, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3505 + }, + { + "epoch": 2.8003194888178915, + "grad_norm": 0.07981286942958832, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3506 + }, + { + "epoch": 2.8011182108626196, + "grad_norm": 0.050319187343120575, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3507 + }, + { + "epoch": 2.801916932907348, + "grad_norm": 0.09955406934022903, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3508 + }, + { + "epoch": 2.8027156549520766, + "grad_norm": 0.048427898436784744, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3509 + }, + { + "epoch": 2.803514376996805, + "grad_norm": 0.0805777907371521, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3510 + }, + { + "epoch": 2.8043130990415337, + "grad_norm": 0.07289621978998184, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3511 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.04940955713391304, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3512 + }, + { + "epoch": 2.8059105431309903, + "grad_norm": 0.07228294759988785, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3513 + }, + { + "epoch": 2.806709265175719, + "grad_norm": 0.06902103871107101, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3514 + }, + { + "epoch": 2.8075079872204474, + "grad_norm": 0.056301236152648926, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3515 + }, + { + "epoch": 2.8083067092651754, + "grad_norm": 0.03880859166383743, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3516 + }, + { + "epoch": 2.809105431309904, + "grad_norm": 0.04914811998605728, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3517 + }, + { + "epoch": 2.8099041533546325, + "grad_norm": 0.04139270633459091, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3518 + }, + { + "epoch": 2.810702875399361, + "grad_norm": 0.05118592828512192, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3519 + }, + { + "epoch": 2.8115015974440896, + "grad_norm": 0.03548616170883179, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 3520 + }, + { + "epoch": 2.812300319488818, + "grad_norm": 0.04883241280913353, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3521 + }, + { + "epoch": 2.813099041533546, + "grad_norm": 0.044492170214653015, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3522 + }, + { + "epoch": 2.8138977635782747, + "grad_norm": 0.050978366285562515, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3523 + }, + { + "epoch": 2.8146964856230032, + "grad_norm": 0.04663826525211334, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3524 + }, + { + "epoch": 2.8154952076677318, + "grad_norm": 0.06378154456615448, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3525 + }, + { + "epoch": 2.81629392971246, + "grad_norm": 0.06913618743419647, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3526 + }, + { + "epoch": 2.8170926517571884, + "grad_norm": 0.084662064909935, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3527 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 0.08352439105510712, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3528 + }, + { + "epoch": 2.8186900958466454, + "grad_norm": 0.07254189252853394, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3529 + }, + { + "epoch": 2.819488817891374, + "grad_norm": 0.04416285827755928, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 3530 + }, + { + "epoch": 2.8202875399361025, + "grad_norm": 0.056230951100587845, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3531 + }, + { + "epoch": 2.8210862619808306, + "grad_norm": 0.11055732518434525, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3532 + }, + { + "epoch": 2.821884984025559, + "grad_norm": 0.08660246431827545, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3533 + }, + { + "epoch": 2.8226837060702876, + "grad_norm": 0.0691947191953659, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3534 + }, + { + "epoch": 2.8234824281150157, + "grad_norm": 0.09254545718431473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3535 + }, + { + "epoch": 2.8242811501597442, + "grad_norm": 0.0663340613245964, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3536 + }, + { + "epoch": 2.8250798722044728, + "grad_norm": 0.05052514374256134, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3537 + }, + { + "epoch": 2.8258785942492013, + "grad_norm": 0.08364969491958618, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3538 + }, + { + "epoch": 2.82667731629393, + "grad_norm": 0.08269570767879486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3539 + }, + { + "epoch": 2.8274760383386583, + "grad_norm": 0.06289245933294296, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3540 + }, + { + "epoch": 2.8282747603833864, + "grad_norm": 0.03565627336502075, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3541 + }, + { + "epoch": 2.829073482428115, + "grad_norm": 0.057896651327610016, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3542 + }, + { + "epoch": 2.8298722044728435, + "grad_norm": 0.046379514038562775, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3543 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.06231336295604706, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3544 + }, + { + "epoch": 2.8314696485623, + "grad_norm": 0.03983502462506294, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3545 + }, + { + "epoch": 2.8322683706070286, + "grad_norm": 0.07364759594202042, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3546 + }, + { + "epoch": 2.833067092651757, + "grad_norm": 0.11596816778182983, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3547 + }, + { + "epoch": 2.8338658146964857, + "grad_norm": 0.10731378942728043, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3548 + }, + { + "epoch": 2.834664536741214, + "grad_norm": 0.06365050375461578, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3549 + }, + { + "epoch": 2.8354632587859427, + "grad_norm": 0.055451441556215286, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3550 + }, + { + "epoch": 2.836261980830671, + "grad_norm": 0.1490558534860611, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3551 + }, + { + "epoch": 2.8370607028753994, + "grad_norm": 0.1539796143770218, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3552 + }, + { + "epoch": 2.837859424920128, + "grad_norm": 0.06760501861572266, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3553 + }, + { + "epoch": 2.838658146964856, + "grad_norm": 0.0685611367225647, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 3554 + }, + { + "epoch": 2.8394568690095845, + "grad_norm": 0.14234358072280884, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3555 + }, + { + "epoch": 2.840255591054313, + "grad_norm": 0.14428865909576416, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3556 + }, + { + "epoch": 2.8410543130990416, + "grad_norm": 0.07594695687294006, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3557 + }, + { + "epoch": 2.84185303514377, + "grad_norm": 0.040841538459062576, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3558 + }, + { + "epoch": 2.8426517571884986, + "grad_norm": 0.04991824924945831, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3559 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 0.03846943378448486, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3560 + }, + { + "epoch": 2.844249201277955, + "grad_norm": 0.04851507395505905, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3561 + }, + { + "epoch": 2.8450479233226837, + "grad_norm": 0.0635538399219513, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3562 + }, + { + "epoch": 2.8458466453674123, + "grad_norm": 0.11812663078308105, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3563 + }, + { + "epoch": 2.8466453674121404, + "grad_norm": 0.05664098262786865, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3564 + }, + { + "epoch": 2.847444089456869, + "grad_norm": 0.03532585874199867, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3565 + }, + { + "epoch": 2.8482428115015974, + "grad_norm": 0.06758403033018112, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 3566 + }, + { + "epoch": 2.849041533546326, + "grad_norm": 0.06279300898313522, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3567 + }, + { + "epoch": 2.8498402555910545, + "grad_norm": 0.043967198580503464, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3568 + }, + { + "epoch": 2.850638977635783, + "grad_norm": 0.04900701716542244, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 3569 + }, + { + "epoch": 2.851437699680511, + "grad_norm": 0.07339311391115189, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3570 + }, + { + "epoch": 2.8522364217252396, + "grad_norm": 0.10644743591547012, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3571 + }, + { + "epoch": 2.853035143769968, + "grad_norm": 0.10544353723526001, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3572 + }, + { + "epoch": 2.8538338658146962, + "grad_norm": 0.0590951181948185, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3573 + }, + { + "epoch": 2.8546325878594248, + "grad_norm": 0.05038939788937569, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3574 + }, + { + "epoch": 2.8554313099041533, + "grad_norm": 0.06013040617108345, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3575 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.07330521196126938, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3576 + }, + { + "epoch": 2.8570287539936103, + "grad_norm": 0.12049853056669235, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3577 + }, + { + "epoch": 2.857827476038339, + "grad_norm": 0.13056780397891998, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3578 + }, + { + "epoch": 2.858626198083067, + "grad_norm": 0.12987029552459717, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3579 + }, + { + "epoch": 2.8594249201277955, + "grad_norm": 0.08681001514196396, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3580 + }, + { + "epoch": 2.860223642172524, + "grad_norm": 0.060947105288505554, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3581 + }, + { + "epoch": 2.8610223642172525, + "grad_norm": 0.10896368324756622, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3582 + }, + { + "epoch": 2.8618210862619806, + "grad_norm": 0.1251460760831833, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3583 + }, + { + "epoch": 2.862619808306709, + "grad_norm": 0.035174671560525894, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3584 + }, + { + "epoch": 2.8634185303514377, + "grad_norm": 0.12026303261518478, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3585 + }, + { + "epoch": 2.864217252396166, + "grad_norm": 0.16679063439369202, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3586 + }, + { + "epoch": 2.8650159744408947, + "grad_norm": 0.19229409098625183, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3587 + }, + { + "epoch": 2.8658146964856233, + "grad_norm": 0.17964699864387512, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3588 + }, + { + "epoch": 2.8666134185303513, + "grad_norm": 0.10671430081129074, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3589 + }, + { + "epoch": 2.86741214057508, + "grad_norm": 0.04453161358833313, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 3590 + }, + { + "epoch": 2.8682108626198084, + "grad_norm": 0.1531655639410019, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3591 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.19321779906749725, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3592 + }, + { + "epoch": 2.869808306709265, + "grad_norm": 0.19540782272815704, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3593 + }, + { + "epoch": 2.8706070287539935, + "grad_norm": 0.22210878133773804, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3594 + }, + { + "epoch": 2.871405750798722, + "grad_norm": 0.2089247703552246, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3595 + }, + { + "epoch": 2.8722044728434506, + "grad_norm": 0.11910446733236313, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3596 + }, + { + "epoch": 2.873003194888179, + "grad_norm": 0.05230247974395752, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3597 + }, + { + "epoch": 2.873801916932907, + "grad_norm": 0.09492263197898865, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3598 + }, + { + "epoch": 2.8746006389776357, + "grad_norm": 0.1396690160036087, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3599 + }, + { + "epoch": 2.8753993610223643, + "grad_norm": 0.12218718230724335, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3600 + }, + { + "epoch": 2.876198083067093, + "grad_norm": 0.05510007217526436, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 3601 + }, + { + "epoch": 2.876996805111821, + "grad_norm": 0.04949348792433739, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3602 + }, + { + "epoch": 2.8777955271565494, + "grad_norm": 0.06522537767887115, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3603 + }, + { + "epoch": 2.878594249201278, + "grad_norm": 0.034176018089056015, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3604 + }, + { + "epoch": 2.8793929712460065, + "grad_norm": 0.07579770684242249, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3605 + }, + { + "epoch": 2.880191693290735, + "grad_norm": 0.09512948244810104, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3606 + }, + { + "epoch": 2.8809904153354635, + "grad_norm": 0.059753213077783585, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3607 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.2461470365524292, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3608 + }, + { + "epoch": 2.88258785942492, + "grad_norm": 0.11298660188913345, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3609 + }, + { + "epoch": 2.8833865814696487, + "grad_norm": 0.20638997852802277, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3610 + }, + { + "epoch": 2.8841853035143767, + "grad_norm": 0.2394232600927353, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3611 + }, + { + "epoch": 2.8849840255591053, + "grad_norm": 0.15168963372707367, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3612 + }, + { + "epoch": 2.885782747603834, + "grad_norm": 0.03990825638175011, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3613 + }, + { + "epoch": 2.8865814696485623, + "grad_norm": 0.1725347936153412, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3614 + }, + { + "epoch": 2.887380191693291, + "grad_norm": 0.20821869373321533, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3615 + }, + { + "epoch": 2.8881789137380194, + "grad_norm": 0.14441269636154175, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3616 + }, + { + "epoch": 2.8889776357827475, + "grad_norm": 0.037162624299526215, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3617 + }, + { + "epoch": 2.889776357827476, + "grad_norm": 0.11550657451152802, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3618 + }, + { + "epoch": 2.8905750798722045, + "grad_norm": 0.15214277803897858, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3619 + }, + { + "epoch": 2.891373801916933, + "grad_norm": 0.09059946238994598, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3620 + }, + { + "epoch": 2.892172523961661, + "grad_norm": 0.03436599299311638, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3621 + }, + { + "epoch": 2.8929712460063897, + "grad_norm": 0.0839625746011734, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3622 + }, + { + "epoch": 2.893769968051118, + "grad_norm": 0.1618664264678955, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3623 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.08216597139835358, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3624 + }, + { + "epoch": 2.8953674121405752, + "grad_norm": 0.06303965300321579, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3625 + }, + { + "epoch": 2.8961661341853038, + "grad_norm": 0.050278183072805405, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3626 + }, + { + "epoch": 2.896964856230032, + "grad_norm": 0.04620242863893509, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3627 + }, + { + "epoch": 2.8977635782747604, + "grad_norm": 0.04937691614031792, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3628 + }, + { + "epoch": 2.898562300319489, + "grad_norm": 0.056928347796201706, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3629 + }, + { + "epoch": 2.899361022364217, + "grad_norm": 0.04932256042957306, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3630 + }, + { + "epoch": 2.9001597444089455, + "grad_norm": 0.04320303350687027, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3631 + }, + { + "epoch": 2.900958466453674, + "grad_norm": 0.08589868247509003, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3632 + }, + { + "epoch": 2.9017571884984026, + "grad_norm": 0.11458484083414078, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3633 + }, + { + "epoch": 2.902555910543131, + "grad_norm": 0.13549752533435822, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3634 + }, + { + "epoch": 2.9033546325878596, + "grad_norm": 0.1327086091041565, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3635 + }, + { + "epoch": 2.9041533546325877, + "grad_norm": 0.08295682817697525, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3636 + }, + { + "epoch": 2.9049520766773163, + "grad_norm": 0.05216526240110397, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3637 + }, + { + "epoch": 2.905750798722045, + "grad_norm": 0.11048691719770432, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3638 + }, + { + "epoch": 2.9065495207667733, + "grad_norm": 0.17681372165679932, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3639 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.16901300847530365, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3640 + }, + { + "epoch": 2.90814696485623, + "grad_norm": 0.10261020064353943, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3641 + }, + { + "epoch": 2.9089456869009584, + "grad_norm": 0.042478349059820175, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3642 + }, + { + "epoch": 2.909744408945687, + "grad_norm": 0.11727496981620789, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3643 + }, + { + "epoch": 2.9105431309904155, + "grad_norm": 0.14884977042675018, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3644 + }, + { + "epoch": 2.911341853035144, + "grad_norm": 0.047877270728349686, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3645 + }, + { + "epoch": 2.912140575079872, + "grad_norm": 0.11930714547634125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3646 + }, + { + "epoch": 2.9129392971246006, + "grad_norm": 0.1873956024646759, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3647 + }, + { + "epoch": 2.913738019169329, + "grad_norm": 0.22310249507427216, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3648 + }, + { + "epoch": 2.9145367412140573, + "grad_norm": 0.21259911358356476, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3649 + }, + { + "epoch": 2.915335463258786, + "grad_norm": 0.11584217846393585, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3650 + }, + { + "epoch": 2.9161341853035143, + "grad_norm": 0.04092720150947571, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3651 + }, + { + "epoch": 2.916932907348243, + "grad_norm": 0.14542047679424286, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3652 + }, + { + "epoch": 2.9177316293929714, + "grad_norm": 0.16328515112400055, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3653 + }, + { + "epoch": 2.9185303514377, + "grad_norm": 0.11284583806991577, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3654 + }, + { + "epoch": 2.919329073482428, + "grad_norm": 0.03723357245326042, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3655 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.1347448229789734, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3656 + }, + { + "epoch": 2.920926517571885, + "grad_norm": 0.1697797328233719, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3657 + }, + { + "epoch": 2.9217252396166136, + "grad_norm": 0.12122484296560287, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3658 + }, + { + "epoch": 2.9225239616613417, + "grad_norm": 0.043503791093826294, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3659 + }, + { + "epoch": 2.92332268370607, + "grad_norm": 0.1600242555141449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3660 + }, + { + "epoch": 2.9241214057507987, + "grad_norm": 0.21065576374530792, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3661 + }, + { + "epoch": 2.9249201277955272, + "grad_norm": 0.16726253926753998, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3662 + }, + { + "epoch": 2.9257188498402558, + "grad_norm": 0.09178615361452103, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3663 + }, + { + "epoch": 2.9265175718849843, + "grad_norm": 0.0447201170027256, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3664 + }, + { + "epoch": 2.9273162939297124, + "grad_norm": 0.10462333261966705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3665 + }, + { + "epoch": 2.928115015974441, + "grad_norm": 0.08236772567033768, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3666 + }, + { + "epoch": 2.9289137380191694, + "grad_norm": 0.06551375985145569, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3667 + }, + { + "epoch": 2.9297124600638975, + "grad_norm": 0.1531982123851776, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3668 + }, + { + "epoch": 2.930511182108626, + "grad_norm": 0.19483166933059692, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3669 + }, + { + "epoch": 2.9313099041533546, + "grad_norm": 0.12347809225320816, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3670 + }, + { + "epoch": 2.932108626198083, + "grad_norm": 0.05494467169046402, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3671 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 0.2280847579240799, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3672 + }, + { + "epoch": 2.93370607028754, + "grad_norm": 0.30344241857528687, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3673 + }, + { + "epoch": 2.9345047923322682, + "grad_norm": 0.243449404835701, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3674 + }, + { + "epoch": 2.9353035143769968, + "grad_norm": 0.11542543768882751, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3675 + }, + { + "epoch": 2.9361022364217253, + "grad_norm": 0.09501481056213379, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3676 + }, + { + "epoch": 2.936900958466454, + "grad_norm": 0.2299363762140274, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3677 + }, + { + "epoch": 2.937699680511182, + "grad_norm": 0.15020152926445007, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3678 + }, + { + "epoch": 2.9384984025559104, + "grad_norm": 0.0655093789100647, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3679 + }, + { + "epoch": 2.939297124600639, + "grad_norm": 0.15242713689804077, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3680 + }, + { + "epoch": 2.9400958466453675, + "grad_norm": 0.13315139710903168, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3681 + }, + { + "epoch": 2.940894568690096, + "grad_norm": 0.05966462939977646, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3682 + }, + { + "epoch": 2.9416932907348246, + "grad_norm": 0.08146806806325912, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3683 + }, + { + "epoch": 2.9424920127795526, + "grad_norm": 0.13615436851978302, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3684 + }, + { + "epoch": 2.943290734824281, + "grad_norm": 0.10889092832803726, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3685 + }, + { + "epoch": 2.9440894568690097, + "grad_norm": 0.03455124795436859, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3686 + }, + { + "epoch": 2.9448881789137378, + "grad_norm": 0.07490532845258713, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3687 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.08072194457054138, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3688 + }, + { + "epoch": 2.946485623003195, + "grad_norm": 0.03630111739039421, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3689 + }, + { + "epoch": 2.9472843450479234, + "grad_norm": 0.09075939655303955, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3690 + }, + { + "epoch": 2.948083067092652, + "grad_norm": 0.1618475615978241, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3691 + }, + { + "epoch": 2.9488817891373804, + "grad_norm": 0.18354517221450806, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3692 + }, + { + "epoch": 2.9496805111821085, + "grad_norm": 0.170358344912529, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3693 + }, + { + "epoch": 2.950479233226837, + "grad_norm": 0.10800250619649887, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3694 + }, + { + "epoch": 2.9512779552715656, + "grad_norm": 0.03771398589015007, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3695 + }, + { + "epoch": 2.952076677316294, + "grad_norm": 0.07931157946586609, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3696 + }, + { + "epoch": 2.952875399361022, + "grad_norm": 0.08149557560682297, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3697 + }, + { + "epoch": 2.9536741214057507, + "grad_norm": 0.05122899264097214, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3698 + }, + { + "epoch": 2.9544728434504792, + "grad_norm": 0.040845707058906555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3699 + }, + { + "epoch": 2.9552715654952078, + "grad_norm": 0.11444225907325745, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3700 + }, + { + "epoch": 2.9560702875399363, + "grad_norm": 0.20140959322452545, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3701 + }, + { + "epoch": 2.956869009584665, + "grad_norm": 0.24982111155986786, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3702 + }, + { + "epoch": 2.957667731629393, + "grad_norm": 0.21290510892868042, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3703 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.11526014655828476, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3704 + }, + { + "epoch": 2.95926517571885, + "grad_norm": 0.03769242390990257, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3705 + }, + { + "epoch": 2.960063897763578, + "grad_norm": 0.091837577521801, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3706 + }, + { + "epoch": 2.9608626198083066, + "grad_norm": 0.0956759825348854, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3707 + }, + { + "epoch": 2.961661341853035, + "grad_norm": 0.06945781409740448, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3708 + }, + { + "epoch": 2.9624600638977636, + "grad_norm": 0.03904029354453087, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3709 + }, + { + "epoch": 2.963258785942492, + "grad_norm": 0.1264238953590393, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3710 + }, + { + "epoch": 2.9640575079872207, + "grad_norm": 0.1689605861902237, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3711 + }, + { + "epoch": 2.9648562300319488, + "grad_norm": 0.15059368312358856, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3712 + }, + { + "epoch": 2.9656549520766773, + "grad_norm": 0.12976346909999847, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3713 + }, + { + "epoch": 2.966453674121406, + "grad_norm": 0.08460741490125656, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3714 + }, + { + "epoch": 2.9672523961661343, + "grad_norm": 0.04914790764451027, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3715 + }, + { + "epoch": 2.9680511182108624, + "grad_norm": 0.09629235416650772, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3716 + }, + { + "epoch": 2.968849840255591, + "grad_norm": 0.0895731970667839, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 3717 + }, + { + "epoch": 2.9696485623003195, + "grad_norm": 0.039528124034404755, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3718 + }, + { + "epoch": 2.970447284345048, + "grad_norm": 0.12843455374240875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3719 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.1754530519247055, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3720 + }, + { + "epoch": 2.972044728434505, + "grad_norm": 0.14169782400131226, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3721 + }, + { + "epoch": 2.972843450479233, + "grad_norm": 0.04416975378990173, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3722 + }, + { + "epoch": 2.9736421725239617, + "grad_norm": 0.1259031444787979, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3723 + }, + { + "epoch": 2.97444089456869, + "grad_norm": 0.17667949199676514, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3724 + }, + { + "epoch": 2.9752396166134183, + "grad_norm": 0.1213974729180336, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3725 + }, + { + "epoch": 2.976038338658147, + "grad_norm": 0.052554335445165634, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3726 + }, + { + "epoch": 2.9768370607028753, + "grad_norm": 0.13257208466529846, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3727 + }, + { + "epoch": 2.977635782747604, + "grad_norm": 0.1463504135608673, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3728 + }, + { + "epoch": 2.9784345047923324, + "grad_norm": 0.08546306937932968, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3729 + }, + { + "epoch": 2.979233226837061, + "grad_norm": 0.04226094111800194, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3730 + }, + { + "epoch": 2.980031948881789, + "grad_norm": 0.0924859419465065, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3731 + }, + { + "epoch": 2.9808306709265175, + "grad_norm": 0.1094423234462738, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3732 + }, + { + "epoch": 2.981629392971246, + "grad_norm": 0.11132006347179413, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3733 + }, + { + "epoch": 2.9824281150159746, + "grad_norm": 0.11010250449180603, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3734 + }, + { + "epoch": 2.9832268370607027, + "grad_norm": 0.10370460152626038, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3735 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 0.08460240811109543, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3736 + }, + { + "epoch": 2.9848242811501597, + "grad_norm": 0.06218400225043297, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3737 + }, + { + "epoch": 2.9856230031948883, + "grad_norm": 0.07446395605802536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3738 + }, + { + "epoch": 2.986421725239617, + "grad_norm": 0.06072726845741272, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3739 + }, + { + "epoch": 2.987220447284345, + "grad_norm": 0.07607559114694595, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3740 + }, + { + "epoch": 2.9880191693290734, + "grad_norm": 0.151380717754364, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3741 + }, + { + "epoch": 2.988817891373802, + "grad_norm": 0.24132277071475983, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3742 + }, + { + "epoch": 2.9896166134185305, + "grad_norm": 0.2346547245979309, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3743 + }, + { + "epoch": 2.9904153354632586, + "grad_norm": 0.090092234313488, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3744 + }, + { + "epoch": 2.991214057507987, + "grad_norm": 0.10230003297328949, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3745 + }, + { + "epoch": 2.9920127795527156, + "grad_norm": 0.17678654193878174, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3746 + }, + { + "epoch": 2.992811501597444, + "grad_norm": 0.16382110118865967, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3747 + }, + { + "epoch": 2.9936102236421727, + "grad_norm": 0.06456442922353745, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3748 + }, + { + "epoch": 2.994408945686901, + "grad_norm": 0.1774967759847641, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3749 + }, + { + "epoch": 2.9952076677316293, + "grad_norm": 0.19274447858333588, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3750 + }, + { + "epoch": 2.996006389776358, + "grad_norm": 0.10767998546361923, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3751 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.07864238321781158, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3752 + }, + { + "epoch": 2.997603833865815, + "grad_norm": 0.21339190006256104, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3753 + }, + { + "epoch": 2.998402555910543, + "grad_norm": 0.2560347616672516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3754 + }, + { + "epoch": 2.9992012779552715, + "grad_norm": 0.15730907022953033, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3755 + }, + { + "epoch": 3.0, + "grad_norm": 0.09766457974910736, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3756 + }, + { + "epoch": 3.0007987220447285, + "grad_norm": 0.24393433332443237, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3757 + }, + { + "epoch": 3.001597444089457, + "grad_norm": 0.17650263011455536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3758 + }, + { + "epoch": 3.002396166134185, + "grad_norm": 0.06490518152713776, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3759 + }, + { + "epoch": 3.0031948881789137, + "grad_norm": 0.10893388092517853, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3760 + }, + { + "epoch": 3.003993610223642, + "grad_norm": 0.13606922328472137, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3761 + }, + { + "epoch": 3.0047923322683707, + "grad_norm": 0.07880546152591705, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3762 + }, + { + "epoch": 3.0055910543130993, + "grad_norm": 0.04203686863183975, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3763 + }, + { + "epoch": 3.0063897763578273, + "grad_norm": 0.07509997487068176, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3764 + }, + { + "epoch": 3.007188498402556, + "grad_norm": 0.08529910445213318, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3765 + }, + { + "epoch": 3.0079872204472844, + "grad_norm": 0.05542825534939766, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3766 + }, + { + "epoch": 3.008785942492013, + "grad_norm": 0.08245155215263367, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3767 + }, + { + "epoch": 3.009584664536741, + "grad_norm": 0.09580255299806595, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3768 + }, + { + "epoch": 3.0103833865814695, + "grad_norm": 0.08233854174613953, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3769 + }, + { + "epoch": 3.011182108626198, + "grad_norm": 0.0589553639292717, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3770 + }, + { + "epoch": 3.0119808306709266, + "grad_norm": 0.09862494468688965, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3771 + }, + { + "epoch": 3.012779552715655, + "grad_norm": 0.1471278816461563, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3772 + }, + { + "epoch": 3.013578274760383, + "grad_norm": 0.1422986537218094, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3773 + }, + { + "epoch": 3.0143769968051117, + "grad_norm": 0.06627846509218216, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3774 + }, + { + "epoch": 3.0151757188498403, + "grad_norm": 0.04936077445745468, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3775 + }, + { + "epoch": 3.015974440894569, + "grad_norm": 0.0745953619480133, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3776 + }, + { + "epoch": 3.0167731629392973, + "grad_norm": 0.0725102499127388, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3777 + }, + { + "epoch": 3.0175718849840254, + "grad_norm": 0.04181717708706856, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3778 + }, + { + "epoch": 3.018370607028754, + "grad_norm": 0.09955357760190964, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3779 + }, + { + "epoch": 3.0191693290734825, + "grad_norm": 0.21014735102653503, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3780 + }, + { + "epoch": 3.019968051118211, + "grad_norm": 0.30597689747810364, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3781 + }, + { + "epoch": 3.0207667731629395, + "grad_norm": 0.2930602431297302, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3782 + }, + { + "epoch": 3.0215654952076676, + "grad_norm": 0.1190100908279419, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3783 + }, + { + "epoch": 3.022364217252396, + "grad_norm": 0.0655524879693985, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3784 + }, + { + "epoch": 3.0231629392971247, + "grad_norm": 0.12062554061412811, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3785 + }, + { + "epoch": 3.023961661341853, + "grad_norm": 0.09680327773094177, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3786 + }, + { + "epoch": 3.0247603833865813, + "grad_norm": 0.0555860660970211, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3787 + }, + { + "epoch": 3.02555910543131, + "grad_norm": 0.1271962672472, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3788 + }, + { + "epoch": 3.0263578274760383, + "grad_norm": 0.12178758531808853, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3789 + }, + { + "epoch": 3.027156549520767, + "grad_norm": 0.09623143821954727, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3790 + }, + { + "epoch": 3.0279552715654954, + "grad_norm": 0.04004101827740669, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3791 + }, + { + "epoch": 3.0287539936102235, + "grad_norm": 0.14001014828681946, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3792 + }, + { + "epoch": 3.029552715654952, + "grad_norm": 0.24241770803928375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3793 + }, + { + "epoch": 3.0303514376996805, + "grad_norm": 0.29141902923583984, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3794 + }, + { + "epoch": 3.031150159744409, + "grad_norm": 0.22814971208572388, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 3795 + }, + { + "epoch": 3.0319488817891376, + "grad_norm": 0.08114828914403915, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3796 + }, + { + "epoch": 3.0327476038338657, + "grad_norm": 0.08104736357927322, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3797 + }, + { + "epoch": 3.033546325878594, + "grad_norm": 0.12007702887058258, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3798 + }, + { + "epoch": 3.0343450479233227, + "grad_norm": 0.06497872620820999, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3799 + }, + { + "epoch": 3.0351437699680512, + "grad_norm": 0.07407233864068985, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3800 + }, + { + "epoch": 3.0359424920127798, + "grad_norm": 0.16386932134628296, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3801 + }, + { + "epoch": 3.036741214057508, + "grad_norm": 0.21633599698543549, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3802 + }, + { + "epoch": 3.0375399361022364, + "grad_norm": 0.19224147498607635, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3803 + }, + { + "epoch": 3.038338658146965, + "grad_norm": 0.04962728172540665, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3804 + }, + { + "epoch": 3.0391373801916934, + "grad_norm": 0.17984353005886078, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3805 + }, + { + "epoch": 3.0399361022364215, + "grad_norm": 0.31483346223831177, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3806 + }, + { + "epoch": 3.04073482428115, + "grad_norm": 0.27175095677375793, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3807 + }, + { + "epoch": 3.0415335463258786, + "grad_norm": 0.06302175670862198, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3808 + }, + { + "epoch": 3.042332268370607, + "grad_norm": 0.18620255589485168, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3809 + }, + { + "epoch": 3.0431309904153356, + "grad_norm": 0.23254868388175964, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3810 + }, + { + "epoch": 3.0439297124600637, + "grad_norm": 0.08763844519853592, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3811 + }, + { + "epoch": 3.0447284345047922, + "grad_norm": 0.13173392415046692, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3812 + }, + { + "epoch": 3.0455271565495208, + "grad_norm": 0.24171577394008636, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3813 + }, + { + "epoch": 3.0463258785942493, + "grad_norm": 0.17649634182453156, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3814 + }, + { + "epoch": 3.047124600638978, + "grad_norm": 0.03800780326128006, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3815 + }, + { + "epoch": 3.047923322683706, + "grad_norm": 0.20039476454257965, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3816 + }, + { + "epoch": 3.0487220447284344, + "grad_norm": 0.26794761419296265, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 3817 + }, + { + "epoch": 3.049520766773163, + "grad_norm": 0.18026290833950043, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3818 + }, + { + "epoch": 3.0503194888178915, + "grad_norm": 0.07298897206783295, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 3819 + }, + { + "epoch": 3.0511182108626196, + "grad_norm": 0.11078597605228424, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 3820 + }, + { + "epoch": 3.051916932907348, + "grad_norm": 0.13672129809856415, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3821 + }, + { + "epoch": 3.0527156549520766, + "grad_norm": 0.11172370612621307, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3822 + }, + { + "epoch": 3.053514376996805, + "grad_norm": 0.09000302106142044, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3823 + }, + { + "epoch": 3.0543130990415337, + "grad_norm": 0.055291030555963516, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3824 + }, + { + "epoch": 3.055111821086262, + "grad_norm": 0.05691349133849144, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3825 + }, + { + "epoch": 3.0559105431309903, + "grad_norm": 0.0744122862815857, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3826 + }, + { + "epoch": 3.056709265175719, + "grad_norm": 0.06438847631216049, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3827 + }, + { + "epoch": 3.0575079872204474, + "grad_norm": 0.0926717221736908, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3828 + }, + { + "epoch": 3.058306709265176, + "grad_norm": 0.15286727249622345, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3829 + }, + { + "epoch": 3.059105431309904, + "grad_norm": 0.2049989253282547, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3830 + }, + { + "epoch": 3.0599041533546325, + "grad_norm": 0.1832154393196106, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3831 + }, + { + "epoch": 3.060702875399361, + "grad_norm": 0.0953374058008194, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3832 + }, + { + "epoch": 3.0615015974440896, + "grad_norm": 0.063878633081913, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3833 + }, + { + "epoch": 3.062300319488818, + "grad_norm": 0.17062409222126007, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3834 + }, + { + "epoch": 3.063099041533546, + "grad_norm": 0.23467828333377838, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3835 + }, + { + "epoch": 3.0638977635782747, + "grad_norm": 0.19458062946796417, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3836 + }, + { + "epoch": 3.0646964856230032, + "grad_norm": 0.06614453345537186, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3837 + }, + { + "epoch": 3.0654952076677318, + "grad_norm": 0.1250256896018982, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3838 + }, + { + "epoch": 3.06629392971246, + "grad_norm": 0.2399163395166397, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3839 + }, + { + "epoch": 3.0670926517571884, + "grad_norm": 0.22544947266578674, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3840 + }, + { + "epoch": 3.067891373801917, + "grad_norm": 0.0710826963186264, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3841 + }, + { + "epoch": 3.0686900958466454, + "grad_norm": 0.12259501218795776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3842 + }, + { + "epoch": 3.069488817891374, + "grad_norm": 0.1313357651233673, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3843 + }, + { + "epoch": 3.070287539936102, + "grad_norm": 0.05492740869522095, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3844 + }, + { + "epoch": 3.0710862619808306, + "grad_norm": 0.08860959857702255, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3845 + }, + { + "epoch": 3.071884984025559, + "grad_norm": 0.12556305527687073, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3846 + }, + { + "epoch": 3.0726837060702876, + "grad_norm": 0.10780923813581467, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3847 + }, + { + "epoch": 3.073482428115016, + "grad_norm": 0.0587402880191803, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3848 + }, + { + "epoch": 3.0742811501597442, + "grad_norm": 0.06155085563659668, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3849 + }, + { + "epoch": 3.0750798722044728, + "grad_norm": 0.07258733361959457, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3850 + }, + { + "epoch": 3.0758785942492013, + "grad_norm": 0.060939520597457886, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3851 + }, + { + "epoch": 3.07667731629393, + "grad_norm": 0.07125407457351685, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3852 + }, + { + "epoch": 3.0774760383386583, + "grad_norm": 0.15338753163814545, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3853 + }, + { + "epoch": 3.0782747603833864, + "grad_norm": 0.18328991532325745, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3854 + }, + { + "epoch": 3.079073482428115, + "grad_norm": 0.1338629275560379, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3855 + }, + { + "epoch": 3.0798722044728435, + "grad_norm": 0.042017024010419846, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3856 + }, + { + "epoch": 3.080670926517572, + "grad_norm": 0.13696196675300598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3857 + }, + { + "epoch": 3.0814696485623, + "grad_norm": 0.17552919685840607, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3858 + }, + { + "epoch": 3.0822683706070286, + "grad_norm": 0.09906235337257385, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3859 + }, + { + "epoch": 3.083067092651757, + "grad_norm": 0.057398926466703415, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3860 + }, + { + "epoch": 3.0838658146964857, + "grad_norm": 0.12260781973600388, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3861 + }, + { + "epoch": 3.084664536741214, + "grad_norm": 0.12672549486160278, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3862 + }, + { + "epoch": 3.0854632587859423, + "grad_norm": 0.07239031046628952, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3863 + }, + { + "epoch": 3.086261980830671, + "grad_norm": 0.0928259864449501, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3864 + }, + { + "epoch": 3.0870607028753994, + "grad_norm": 0.2161056250333786, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3865 + }, + { + "epoch": 3.087859424920128, + "grad_norm": 0.21302388608455658, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3866 + }, + { + "epoch": 3.0886581469648564, + "grad_norm": 0.10730110853910446, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3867 + }, + { + "epoch": 3.0894568690095845, + "grad_norm": 0.06801975518465042, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3868 + }, + { + "epoch": 3.090255591054313, + "grad_norm": 0.09036632627248764, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3869 + }, + { + "epoch": 3.0910543130990416, + "grad_norm": 0.1344052255153656, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3870 + }, + { + "epoch": 3.09185303514377, + "grad_norm": 0.10774482041597366, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3871 + }, + { + "epoch": 3.0926517571884986, + "grad_norm": 0.06824023276567459, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3872 + }, + { + "epoch": 3.0934504792332267, + "grad_norm": 0.11959507316350937, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3873 + }, + { + "epoch": 3.094249201277955, + "grad_norm": 0.14943768084049225, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3874 + }, + { + "epoch": 3.0950479233226837, + "grad_norm": 0.13593481481075287, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3875 + }, + { + "epoch": 3.0958466453674123, + "grad_norm": 0.06872473657131195, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3876 + }, + { + "epoch": 3.0966453674121404, + "grad_norm": 0.07243353873491287, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3877 + }, + { + "epoch": 3.097444089456869, + "grad_norm": 0.07884293049573898, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3878 + }, + { + "epoch": 3.0982428115015974, + "grad_norm": 0.09574474394321442, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3879 + }, + { + "epoch": 3.099041533546326, + "grad_norm": 0.09028270840644836, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3880 + }, + { + "epoch": 3.0998402555910545, + "grad_norm": 0.056680940091609955, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3881 + }, + { + "epoch": 3.1006389776357826, + "grad_norm": 0.13817615807056427, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3882 + }, + { + "epoch": 3.101437699680511, + "grad_norm": 0.16102705895900726, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3883 + }, + { + "epoch": 3.1022364217252396, + "grad_norm": 0.08887791633605957, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3884 + }, + { + "epoch": 3.103035143769968, + "grad_norm": 0.055100735276937485, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3885 + }, + { + "epoch": 3.1038338658146967, + "grad_norm": 0.10710839927196503, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3886 + }, + { + "epoch": 3.1046325878594248, + "grad_norm": 0.09228713810443878, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3887 + }, + { + "epoch": 3.1054313099041533, + "grad_norm": 0.04602783918380737, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3888 + }, + { + "epoch": 3.106230031948882, + "grad_norm": 0.03584764152765274, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3889 + }, + { + "epoch": 3.1070287539936103, + "grad_norm": 0.04486532881855965, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3890 + }, + { + "epoch": 3.107827476038339, + "grad_norm": 0.036488354206085205, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3891 + }, + { + "epoch": 3.108626198083067, + "grad_norm": 0.04213477671146393, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3892 + }, + { + "epoch": 3.1094249201277955, + "grad_norm": 0.03840509057044983, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 3893 + }, + { + "epoch": 3.110223642172524, + "grad_norm": 0.04800419509410858, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3894 + }, + { + "epoch": 3.1110223642172525, + "grad_norm": 0.06467507034540176, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3895 + }, + { + "epoch": 3.1118210862619806, + "grad_norm": 0.05736416578292847, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3896 + }, + { + "epoch": 3.112619808306709, + "grad_norm": 0.03337489813566208, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3897 + }, + { + "epoch": 3.1134185303514377, + "grad_norm": 0.088229238986969, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3898 + }, + { + "epoch": 3.114217252396166, + "grad_norm": 0.1492392122745514, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3899 + }, + { + "epoch": 3.1150159744408947, + "grad_norm": 0.1699269413948059, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3900 + }, + { + "epoch": 3.115814696485623, + "grad_norm": 0.11532948911190033, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3901 + }, + { + "epoch": 3.1166134185303513, + "grad_norm": 0.030054764822125435, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3902 + }, + { + "epoch": 3.11741214057508, + "grad_norm": 0.11079075932502747, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 3903 + }, + { + "epoch": 3.1182108626198084, + "grad_norm": 0.15733082592487335, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3904 + }, + { + "epoch": 3.119009584664537, + "grad_norm": 0.12520034611225128, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 3905 + }, + { + "epoch": 3.119808306709265, + "grad_norm": 0.03382280096411705, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3906 + }, + { + "epoch": 3.1206070287539935, + "grad_norm": 0.11951576173305511, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3907 + }, + { + "epoch": 3.121405750798722, + "grad_norm": 0.2123839259147644, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3908 + }, + { + "epoch": 3.1222044728434506, + "grad_norm": 0.15437674522399902, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3909 + }, + { + "epoch": 3.123003194888179, + "grad_norm": 0.06463608890771866, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3910 + }, + { + "epoch": 3.123801916932907, + "grad_norm": 0.10830746591091156, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3911 + }, + { + "epoch": 3.1246006389776357, + "grad_norm": 0.17621003091335297, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3912 + }, + { + "epoch": 3.1253993610223643, + "grad_norm": 0.12417379021644592, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3913 + }, + { + "epoch": 3.126198083067093, + "grad_norm": 0.05364898219704628, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3914 + }, + { + "epoch": 3.126996805111821, + "grad_norm": 0.17589502036571503, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3915 + }, + { + "epoch": 3.1277955271565494, + "grad_norm": 0.249656081199646, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3916 + }, + { + "epoch": 3.128594249201278, + "grad_norm": 0.1800973266363144, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3917 + }, + { + "epoch": 3.1293929712460065, + "grad_norm": 0.09763745218515396, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3918 + }, + { + "epoch": 3.130191693290735, + "grad_norm": 0.10953835397958755, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3919 + }, + { + "epoch": 3.130990415335463, + "grad_norm": 0.17490456998348236, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3920 + }, + { + "epoch": 3.1317891373801916, + "grad_norm": 0.11533153057098389, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3921 + }, + { + "epoch": 3.13258785942492, + "grad_norm": 0.07494231313467026, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3922 + }, + { + "epoch": 3.1333865814696487, + "grad_norm": 0.14954763650894165, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3923 + }, + { + "epoch": 3.134185303514377, + "grad_norm": 0.18061646819114685, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3924 + }, + { + "epoch": 3.1349840255591053, + "grad_norm": 0.10419650375843048, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3925 + }, + { + "epoch": 3.135782747603834, + "grad_norm": 0.04677566513419151, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3926 + }, + { + "epoch": 3.1365814696485623, + "grad_norm": 0.12846903502941132, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3927 + }, + { + "epoch": 3.137380191693291, + "grad_norm": 0.11824795603752136, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3928 + }, + { + "epoch": 3.1381789137380194, + "grad_norm": 0.04194530099630356, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3929 + }, + { + "epoch": 3.1389776357827475, + "grad_norm": 0.15154412388801575, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3930 + }, + { + "epoch": 3.139776357827476, + "grad_norm": 0.19073615968227386, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3931 + }, + { + "epoch": 3.1405750798722045, + "grad_norm": 0.12614648044109344, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3932 + }, + { + "epoch": 3.141373801916933, + "grad_norm": 0.03434520214796066, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3933 + }, + { + "epoch": 3.142172523961661, + "grad_norm": 0.11913489550352097, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3934 + }, + { + "epoch": 3.1429712460063897, + "grad_norm": 0.16297172009944916, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3935 + }, + { + "epoch": 3.143769968051118, + "grad_norm": 0.15605789422988892, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3936 + }, + { + "epoch": 3.1445686900958467, + "grad_norm": 0.10524406284093857, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3937 + }, + { + "epoch": 3.1453674121405752, + "grad_norm": 0.03763152286410332, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3938 + }, + { + "epoch": 3.1461661341853033, + "grad_norm": 0.07586465775966644, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3939 + }, + { + "epoch": 3.146964856230032, + "grad_norm": 0.14553581178188324, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3940 + }, + { + "epoch": 3.1477635782747604, + "grad_norm": 0.1883595883846283, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3941 + }, + { + "epoch": 3.148562300319489, + "grad_norm": 0.13018599152565002, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3942 + }, + { + "epoch": 3.1493610223642174, + "grad_norm": 0.05356704071164131, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3943 + }, + { + "epoch": 3.1501597444089455, + "grad_norm": 0.2083088606595993, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3944 + }, + { + "epoch": 3.150958466453674, + "grad_norm": 0.2586681544780731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3945 + }, + { + "epoch": 3.1517571884984026, + "grad_norm": 0.18733063340187073, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3946 + }, + { + "epoch": 3.152555910543131, + "grad_norm": 0.03741752356290817, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3947 + }, + { + "epoch": 3.1533546325878596, + "grad_norm": 0.11660216003656387, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3948 + }, + { + "epoch": 3.1541533546325877, + "grad_norm": 0.12698383629322052, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3949 + }, + { + "epoch": 3.1549520766773163, + "grad_norm": 0.10244922339916229, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3950 + }, + { + "epoch": 3.155750798722045, + "grad_norm": 0.03815237060189247, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3951 + }, + { + "epoch": 3.1565495207667733, + "grad_norm": 0.04394761845469475, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3952 + }, + { + "epoch": 3.1573482428115014, + "grad_norm": 0.1344541311264038, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3953 + }, + { + "epoch": 3.15814696485623, + "grad_norm": 0.23006947338581085, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3954 + }, + { + "epoch": 3.1589456869009584, + "grad_norm": 0.2667021155357361, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3955 + }, + { + "epoch": 3.159744408945687, + "grad_norm": 0.2410362809896469, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3956 + }, + { + "epoch": 3.1605431309904155, + "grad_norm": 0.1421661078929901, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3957 + }, + { + "epoch": 3.1613418530351436, + "grad_norm": 0.04178561642765999, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3958 + }, + { + "epoch": 3.162140575079872, + "grad_norm": 0.15327088534832, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3959 + }, + { + "epoch": 3.1629392971246006, + "grad_norm": 0.1372532993555069, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3960 + }, + { + "epoch": 3.163738019169329, + "grad_norm": 0.03763817250728607, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3961 + }, + { + "epoch": 3.1645367412140577, + "grad_norm": 0.13227587938308716, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3962 + }, + { + "epoch": 3.165335463258786, + "grad_norm": 0.1952073723077774, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3963 + }, + { + "epoch": 3.1661341853035143, + "grad_norm": 0.1672048568725586, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3964 + }, + { + "epoch": 3.166932907348243, + "grad_norm": 0.09593698382377625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3965 + }, + { + "epoch": 3.1677316293929714, + "grad_norm": 0.03619454428553581, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3966 + }, + { + "epoch": 3.1685303514377, + "grad_norm": 0.05974683538079262, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3967 + }, + { + "epoch": 3.169329073482428, + "grad_norm": 0.09733424335718155, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3968 + }, + { + "epoch": 3.1701277955271565, + "grad_norm": 0.07536087185144424, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3969 + }, + { + "epoch": 3.170926517571885, + "grad_norm": 0.04263869300484657, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3970 + }, + { + "epoch": 3.1717252396166136, + "grad_norm": 0.040521468967199326, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3971 + }, + { + "epoch": 3.1725239616613417, + "grad_norm": 0.05615096539258957, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3972 + }, + { + "epoch": 3.17332268370607, + "grad_norm": 0.06655194610357285, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3973 + }, + { + "epoch": 3.1741214057507987, + "grad_norm": 0.07300302386283875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3974 + }, + { + "epoch": 3.1749201277955272, + "grad_norm": 0.04789174720644951, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3975 + }, + { + "epoch": 3.1757188498402558, + "grad_norm": 0.03460157290101051, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3976 + }, + { + "epoch": 3.176517571884984, + "grad_norm": 0.0393557995557785, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3977 + }, + { + "epoch": 3.1773162939297124, + "grad_norm": 0.062453389167785645, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3978 + }, + { + "epoch": 3.178115015974441, + "grad_norm": 0.08542043715715408, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3979 + }, + { + "epoch": 3.1789137380191694, + "grad_norm": 0.08002828061580658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3980 + }, + { + "epoch": 3.179712460063898, + "grad_norm": 0.04635196551680565, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3981 + }, + { + "epoch": 3.180511182108626, + "grad_norm": 0.09583642333745956, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3982 + }, + { + "epoch": 3.1813099041533546, + "grad_norm": 0.12418454885482788, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3983 + }, + { + "epoch": 3.182108626198083, + "grad_norm": 0.10457618534564972, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3984 + }, + { + "epoch": 3.1829073482428116, + "grad_norm": 0.07183804363012314, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3985 + }, + { + "epoch": 3.18370607028754, + "grad_norm": 0.039956409484148026, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3986 + }, + { + "epoch": 3.1845047923322682, + "grad_norm": 0.0884016826748848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3987 + }, + { + "epoch": 3.1853035143769968, + "grad_norm": 0.112494558095932, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3988 + }, + { + "epoch": 3.1861022364217253, + "grad_norm": 0.07582054287195206, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3989 + }, + { + "epoch": 3.186900958466454, + "grad_norm": 0.060303278267383575, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3990 + }, + { + "epoch": 3.187699680511182, + "grad_norm": 0.048326775431632996, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3991 + }, + { + "epoch": 3.1884984025559104, + "grad_norm": 0.32322436571121216, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3992 + }, + { + "epoch": 3.189297124600639, + "grad_norm": 0.5569815039634705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3993 + }, + { + "epoch": 3.1900958466453675, + "grad_norm": 0.7590563893318176, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 3994 + }, + { + "epoch": 3.190894568690096, + "grad_norm": 0.6537879705429077, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3995 + }, + { + "epoch": 3.191693290734824, + "grad_norm": 0.16556645929813385, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3996 + }, + { + "epoch": 3.1924920127795526, + "grad_norm": 0.3745940625667572, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3997 + }, + { + "epoch": 3.193290734824281, + "grad_norm": 0.5159009695053101, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 3998 + }, + { + "epoch": 3.1940894568690097, + "grad_norm": 0.1302756816148758, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3999 + }, + { + "epoch": 3.194888178913738, + "grad_norm": 0.3484213054180145, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4000 + }, + { + "epoch": 3.1956869009584663, + "grad_norm": 0.23763029277324677, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4001 + }, + { + "epoch": 3.196485623003195, + "grad_norm": 0.20648746192455292, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4002 + }, + { + "epoch": 3.1972843450479234, + "grad_norm": 0.31230399012565613, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4003 + }, + { + "epoch": 3.198083067092652, + "grad_norm": 0.15389247238636017, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4004 + }, + { + "epoch": 3.1988817891373804, + "grad_norm": 0.6544334292411804, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4005 + }, + { + "epoch": 3.1996805111821085, + "grad_norm": 0.5409669280052185, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4006 + }, + { + "epoch": 3.200479233226837, + "grad_norm": 0.11126074194908142, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4007 + }, + { + "epoch": 3.2012779552715656, + "grad_norm": 0.3257724642753601, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4008 + }, + { + "epoch": 3.202076677316294, + "grad_norm": 0.4188903272151947, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4009 + }, + { + "epoch": 3.202875399361022, + "grad_norm": 0.1012830138206482, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4010 + }, + { + "epoch": 3.2036741214057507, + "grad_norm": 0.2771216034889221, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4011 + }, + { + "epoch": 3.2044728434504792, + "grad_norm": 0.2873278260231018, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4012 + }, + { + "epoch": 3.2052715654952078, + "grad_norm": 0.09620041400194168, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4013 + }, + { + "epoch": 3.2060702875399363, + "grad_norm": 0.10561787337064743, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4014 + }, + { + "epoch": 3.2068690095846644, + "grad_norm": 0.12499046325683594, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4015 + }, + { + "epoch": 3.207667731629393, + "grad_norm": 0.4055064916610718, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4016 + }, + { + "epoch": 3.2084664536741214, + "grad_norm": 0.9722099900245667, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 4017 + }, + { + "epoch": 3.20926517571885, + "grad_norm": 0.7367122173309326, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 4018 + }, + { + "epoch": 3.2100638977635785, + "grad_norm": 0.4455755650997162, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4019 + }, + { + "epoch": 3.2108626198083066, + "grad_norm": 0.10350961983203888, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4020 + }, + { + "epoch": 3.211661341853035, + "grad_norm": 0.41901662945747375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4021 + }, + { + "epoch": 3.2124600638977636, + "grad_norm": 0.5987749695777893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4022 + }, + { + "epoch": 3.213258785942492, + "grad_norm": 1.5967272520065308, + "learning_rate": 0.0005, + "loss": 1.1938, + "step": 4023 + }, + { + "epoch": 3.2140575079872207, + "grad_norm": 3.289113759994507, + "learning_rate": 0.0005, + "loss": 1.2474, + "step": 4024 + }, + { + "epoch": 3.2148562300319488, + "grad_norm": 0.40220701694488525, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 4025 + }, + { + "epoch": 3.2156549520766773, + "grad_norm": 0.15129008889198303, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4026 + }, + { + "epoch": 3.216453674121406, + "grad_norm": 19.060272216796875, + "learning_rate": 0.0005, + "loss": 1.4668, + "step": 4027 + }, + { + "epoch": 3.2172523961661343, + "grad_norm": 1.72987961769104, + "learning_rate": 0.0005, + "loss": 1.3675, + "step": 4028 + }, + { + "epoch": 3.2180511182108624, + "grad_norm": 2.1064836978912354, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 4029 + }, + { + "epoch": 3.218849840255591, + "grad_norm": 1.0206952095031738, + "learning_rate": 0.0005, + "loss": 1.2602, + "step": 4030 + }, + { + "epoch": 3.2196485623003195, + "grad_norm": 14.109564781188965, + "learning_rate": 0.0005, + "loss": 3.9831, + "step": 4031 + }, + { + "epoch": 3.220447284345048, + "grad_norm": 12.518637657165527, + "learning_rate": 0.0005, + "loss": 3.4388, + "step": 4032 + }, + { + "epoch": 3.2212460063897765, + "grad_norm": 4.156238079071045, + "learning_rate": 0.0005, + "loss": 2.1713, + "step": 4033 + }, + { + "epoch": 3.2220447284345046, + "grad_norm": 2.752128839492798, + "learning_rate": 0.0005, + "loss": 1.6581, + "step": 4034 + }, + { + "epoch": 3.222843450479233, + "grad_norm": 5.876696586608887, + "learning_rate": 0.0005, + "loss": 2.1698, + "step": 4035 + }, + { + "epoch": 3.2236421725239617, + "grad_norm": 7.60305118560791, + "learning_rate": 0.0005, + "loss": 3.0713, + "step": 4036 + }, + { + "epoch": 3.22444089456869, + "grad_norm": 2.581448554992676, + "learning_rate": 0.0005, + "loss": 1.7677, + "step": 4037 + }, + { + "epoch": 3.2252396166134187, + "grad_norm": 1.0544116497039795, + "learning_rate": 0.0005, + "loss": 1.4604, + "step": 4038 + }, + { + "epoch": 3.226038338658147, + "grad_norm": 10.742961883544922, + "learning_rate": 0.0005, + "loss": 3.8634, + "step": 4039 + }, + { + "epoch": 3.2268370607028753, + "grad_norm": 6.555435657501221, + "learning_rate": 0.0005, + "loss": 2.7229, + "step": 4040 + }, + { + "epoch": 3.227635782747604, + "grad_norm": 4.335379600524902, + "learning_rate": 0.0005, + "loss": 2.548, + "step": 4041 + }, + { + "epoch": 3.2284345047923324, + "grad_norm": 3.9863200187683105, + "learning_rate": 0.0005, + "loss": 2.5051, + "step": 4042 + }, + { + "epoch": 3.229233226837061, + "grad_norm": 3.4922895431518555, + "learning_rate": 0.0005, + "loss": 2.1996, + "step": 4043 + }, + { + "epoch": 3.230031948881789, + "grad_norm": 0.9404768347740173, + "learning_rate": 0.0005, + "loss": 1.7869, + "step": 4044 + }, + { + "epoch": 3.2308306709265175, + "grad_norm": 1.2953938245773315, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 4045 + }, + { + "epoch": 3.231629392971246, + "grad_norm": 2.0215165615081787, + "learning_rate": 0.0005, + "loss": 1.9429, + "step": 4046 + }, + { + "epoch": 3.2324281150159746, + "grad_norm": 1.2744032144546509, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 4047 + }, + { + "epoch": 3.2332268370607027, + "grad_norm": 2.042656660079956, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 4048 + }, + { + "epoch": 3.234025559105431, + "grad_norm": 6.607172012329102, + "learning_rate": 0.0005, + "loss": 2.8381, + "step": 4049 + }, + { + "epoch": 3.2348242811501597, + "grad_norm": 1.2499932050704956, + "learning_rate": 0.0005, + "loss": 1.6324, + "step": 4050 + }, + { + "epoch": 3.2356230031948883, + "grad_norm": 1.1896424293518066, + "learning_rate": 0.0005, + "loss": 1.7201, + "step": 4051 + }, + { + "epoch": 3.236421725239617, + "grad_norm": 1.9901418685913086, + "learning_rate": 0.0005, + "loss": 1.7335, + "step": 4052 + }, + { + "epoch": 3.237220447284345, + "grad_norm": 0.8886330127716064, + "learning_rate": 0.0005, + "loss": 1.5111, + "step": 4053 + }, + { + "epoch": 3.2380191693290734, + "grad_norm": 2.6570353507995605, + "learning_rate": 0.0005, + "loss": 1.8628, + "step": 4054 + }, + { + "epoch": 3.238817891373802, + "grad_norm": 2.212905168533325, + "learning_rate": 0.0005, + "loss": 1.5838, + "step": 4055 + }, + { + "epoch": 3.2396166134185305, + "grad_norm": 3.1234660148620605, + "learning_rate": 0.0005, + "loss": 1.7212, + "step": 4056 + }, + { + "epoch": 3.2404153354632586, + "grad_norm": 0.9168338775634766, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 4057 + }, + { + "epoch": 3.241214057507987, + "grad_norm": 0.8366042971611023, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 4058 + }, + { + "epoch": 3.2420127795527156, + "grad_norm": 0.5359059572219849, + "learning_rate": 0.0005, + "loss": 1.4185, + "step": 4059 + }, + { + "epoch": 3.242811501597444, + "grad_norm": 1.8511804342269897, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 4060 + }, + { + "epoch": 3.2436102236421727, + "grad_norm": 1.3229485750198364, + "learning_rate": 0.0005, + "loss": 1.4497, + "step": 4061 + }, + { + "epoch": 3.244408945686901, + "grad_norm": 0.8846393823623657, + "learning_rate": 0.0005, + "loss": 1.384, + "step": 4062 + }, + { + "epoch": 3.2452076677316293, + "grad_norm": 1.1345176696777344, + "learning_rate": 0.0005, + "loss": 1.3906, + "step": 4063 + }, + { + "epoch": 3.246006389776358, + "grad_norm": 0.998261034488678, + "learning_rate": 0.0005, + "loss": 1.3807, + "step": 4064 + }, + { + "epoch": 3.2468051118210863, + "grad_norm": 0.8998358249664307, + "learning_rate": 0.0005, + "loss": 1.3321, + "step": 4065 + }, + { + "epoch": 3.247603833865815, + "grad_norm": 0.6892838478088379, + "learning_rate": 0.0005, + "loss": 1.3718, + "step": 4066 + }, + { + "epoch": 3.248402555910543, + "grad_norm": 0.515389084815979, + "learning_rate": 0.0005, + "loss": 1.3296, + "step": 4067 + }, + { + "epoch": 3.2492012779552715, + "grad_norm": 0.41038376092910767, + "learning_rate": 0.0005, + "loss": 1.2855, + "step": 4068 + }, + { + "epoch": 3.25, + "grad_norm": 0.6094494462013245, + "learning_rate": 0.0005, + "loss": 1.2953, + "step": 4069 + }, + { + "epoch": 3.2507987220447285, + "grad_norm": 0.6274027228355408, + "learning_rate": 0.0005, + "loss": 1.2879, + "step": 4070 + }, + { + "epoch": 3.251597444089457, + "grad_norm": 0.8833006024360657, + "learning_rate": 0.0005, + "loss": 1.2806, + "step": 4071 + }, + { + "epoch": 3.252396166134185, + "grad_norm": 0.8688742518424988, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 4072 + }, + { + "epoch": 3.2531948881789137, + "grad_norm": 0.34751075506210327, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 4073 + }, + { + "epoch": 3.253993610223642, + "grad_norm": 0.4245823621749878, + "learning_rate": 0.0005, + "loss": 1.2529, + "step": 4074 + }, + { + "epoch": 3.2547923322683707, + "grad_norm": 0.4495961368083954, + "learning_rate": 0.0005, + "loss": 1.2438, + "step": 4075 + }, + { + "epoch": 3.255591054313099, + "grad_norm": 0.683125913143158, + "learning_rate": 0.0005, + "loss": 1.2297, + "step": 4076 + }, + { + "epoch": 3.2563897763578273, + "grad_norm": 0.4342438876628876, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 4077 + }, + { + "epoch": 3.257188498402556, + "grad_norm": 0.2018793523311615, + "learning_rate": 0.0005, + "loss": 1.2313, + "step": 4078 + }, + { + "epoch": 3.2579872204472844, + "grad_norm": 0.26145434379577637, + "learning_rate": 0.0005, + "loss": 1.218, + "step": 4079 + }, + { + "epoch": 3.258785942492013, + "grad_norm": 0.16941657662391663, + "learning_rate": 0.0005, + "loss": 1.2211, + "step": 4080 + }, + { + "epoch": 3.2595846645367414, + "grad_norm": 0.3158339262008667, + "learning_rate": 0.0005, + "loss": 1.2192, + "step": 4081 + }, + { + "epoch": 3.2603833865814695, + "grad_norm": 0.18630816042423248, + "learning_rate": 0.0005, + "loss": 1.2091, + "step": 4082 + }, + { + "epoch": 3.261182108626198, + "grad_norm": 0.19504855573177338, + "learning_rate": 0.0005, + "loss": 1.2047, + "step": 4083 + }, + { + "epoch": 3.2619808306709266, + "grad_norm": 0.19672146439552307, + "learning_rate": 0.0005, + "loss": 1.2022, + "step": 4084 + }, + { + "epoch": 3.262779552715655, + "grad_norm": 0.15959087014198303, + "learning_rate": 0.0005, + "loss": 1.1957, + "step": 4085 + }, + { + "epoch": 3.263578274760383, + "grad_norm": 0.18326745927333832, + "learning_rate": 0.0005, + "loss": 1.1835, + "step": 4086 + }, + { + "epoch": 3.2643769968051117, + "grad_norm": 0.23495830595493317, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 4087 + }, + { + "epoch": 3.2651757188498403, + "grad_norm": 0.22718247771263123, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 4088 + }, + { + "epoch": 3.265974440894569, + "grad_norm": 0.2913427948951721, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 4089 + }, + { + "epoch": 3.2667731629392973, + "grad_norm": 0.44531312584877014, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 4090 + }, + { + "epoch": 3.2675718849840254, + "grad_norm": 0.6265004277229309, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 4091 + }, + { + "epoch": 3.268370607028754, + "grad_norm": 0.6119574904441833, + "learning_rate": 0.0005, + "loss": 1.186, + "step": 4092 + }, + { + "epoch": 3.2691693290734825, + "grad_norm": 0.23989497125148773, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 4093 + }, + { + "epoch": 3.269968051118211, + "grad_norm": 0.266013503074646, + "learning_rate": 0.0005, + "loss": 1.1693, + "step": 4094 + }, + { + "epoch": 3.270766773162939, + "grad_norm": 0.2205667793750763, + "learning_rate": 0.0005, + "loss": 1.1627, + "step": 4095 + }, + { + "epoch": 3.2715654952076676, + "grad_norm": 0.4600715935230255, + "learning_rate": 0.0005, + "loss": 1.1566, + "step": 4096 + }, + { + "epoch": 3.272364217252396, + "grad_norm": 0.6725661754608154, + "learning_rate": 0.0005, + "loss": 1.1806, + "step": 4097 + }, + { + "epoch": 3.2731629392971247, + "grad_norm": 0.3836606442928314, + "learning_rate": 0.0005, + "loss": 1.1613, + "step": 4098 + }, + { + "epoch": 3.273961661341853, + "grad_norm": 0.3752588629722595, + "learning_rate": 0.0005, + "loss": 1.1639, + "step": 4099 + }, + { + "epoch": 3.2747603833865817, + "grad_norm": 0.3297381103038788, + "learning_rate": 0.0005, + "loss": 1.1488, + "step": 4100 + }, + { + "epoch": 3.27555910543131, + "grad_norm": 0.5899438858032227, + "learning_rate": 0.0005, + "loss": 1.1486, + "step": 4101 + }, + { + "epoch": 3.2763578274760383, + "grad_norm": 0.5899466872215271, + "learning_rate": 0.0005, + "loss": 1.1533, + "step": 4102 + }, + { + "epoch": 3.277156549520767, + "grad_norm": 0.2944958209991455, + "learning_rate": 0.0005, + "loss": 1.1517, + "step": 4103 + }, + { + "epoch": 3.2779552715654954, + "grad_norm": 0.5870373249053955, + "learning_rate": 0.0005, + "loss": 1.1484, + "step": 4104 + }, + { + "epoch": 3.2787539936102235, + "grad_norm": 0.25267326831817627, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 4105 + }, + { + "epoch": 3.279552715654952, + "grad_norm": 0.20602582395076752, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 4106 + }, + { + "epoch": 3.2803514376996805, + "grad_norm": 0.4151447117328644, + "learning_rate": 0.0005, + "loss": 1.1338, + "step": 4107 + }, + { + "epoch": 3.281150159744409, + "grad_norm": 0.6591519117355347, + "learning_rate": 0.0005, + "loss": 1.1395, + "step": 4108 + }, + { + "epoch": 3.2819488817891376, + "grad_norm": 0.48510807752609253, + "learning_rate": 0.0005, + "loss": 1.1496, + "step": 4109 + }, + { + "epoch": 3.2827476038338657, + "grad_norm": 0.27803128957748413, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 4110 + }, + { + "epoch": 3.283546325878594, + "grad_norm": 0.3939184546470642, + "learning_rate": 0.0005, + "loss": 1.141, + "step": 4111 + }, + { + "epoch": 3.2843450479233227, + "grad_norm": 0.18271984159946442, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 4112 + }, + { + "epoch": 3.2851437699680512, + "grad_norm": 0.19690747559070587, + "learning_rate": 0.0005, + "loss": 1.1286, + "step": 4113 + }, + { + "epoch": 3.2859424920127793, + "grad_norm": 0.22968755662441254, + "learning_rate": 0.0005, + "loss": 1.1316, + "step": 4114 + }, + { + "epoch": 3.286741214057508, + "grad_norm": 0.24908174574375153, + "learning_rate": 0.0005, + "loss": 1.1279, + "step": 4115 + }, + { + "epoch": 3.2875399361022364, + "grad_norm": 0.15813285112380981, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 4116 + }, + { + "epoch": 3.288338658146965, + "grad_norm": 0.1056000292301178, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 4117 + }, + { + "epoch": 3.2891373801916934, + "grad_norm": 0.19983351230621338, + "learning_rate": 0.0005, + "loss": 1.118, + "step": 4118 + }, + { + "epoch": 3.289936102236422, + "grad_norm": 0.13660027086734772, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 4119 + }, + { + "epoch": 3.29073482428115, + "grad_norm": 0.15008457005023956, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 4120 + }, + { + "epoch": 3.2915335463258786, + "grad_norm": 0.1475287824869156, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 4121 + }, + { + "epoch": 3.292332268370607, + "grad_norm": 0.10478811711072922, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 4122 + }, + { + "epoch": 3.2931309904153356, + "grad_norm": 0.1577034890651703, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 4123 + }, + { + "epoch": 3.2939297124600637, + "grad_norm": 0.1019970178604126, + "learning_rate": 0.0005, + "loss": 1.1117, + "step": 4124 + }, + { + "epoch": 3.2947284345047922, + "grad_norm": 0.09229713678359985, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 4125 + }, + { + "epoch": 3.2955271565495208, + "grad_norm": 0.10029986500740051, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 4126 + }, + { + "epoch": 3.2963258785942493, + "grad_norm": 0.14171569049358368, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4127 + }, + { + "epoch": 3.297124600638978, + "grad_norm": 0.17343609035015106, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 4128 + }, + { + "epoch": 3.297923322683706, + "grad_norm": 0.2738705277442932, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 4129 + }, + { + "epoch": 3.2987220447284344, + "grad_norm": 0.3518083691596985, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 4130 + }, + { + "epoch": 3.299520766773163, + "grad_norm": 0.16174353659152985, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 4131 + }, + { + "epoch": 3.3003194888178915, + "grad_norm": 0.24402645230293274, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4132 + }, + { + "epoch": 3.3011182108626196, + "grad_norm": 0.23362669348716736, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 4133 + }, + { + "epoch": 3.301916932907348, + "grad_norm": 0.1391523778438568, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 4134 + }, + { + "epoch": 3.3027156549520766, + "grad_norm": 0.1516295224428177, + "learning_rate": 0.0005, + "loss": 1.0968, + "step": 4135 + }, + { + "epoch": 3.303514376996805, + "grad_norm": 0.17463526129722595, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 4136 + }, + { + "epoch": 3.3043130990415337, + "grad_norm": 0.13717398047447205, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 4137 + }, + { + "epoch": 3.3051118210862622, + "grad_norm": 0.16802728176116943, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 4138 + }, + { + "epoch": 3.3059105431309903, + "grad_norm": 0.11959057301282883, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 4139 + }, + { + "epoch": 3.306709265175719, + "grad_norm": 0.07706355303525925, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4140 + }, + { + "epoch": 3.3075079872204474, + "grad_norm": 0.07729125767946243, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 4141 + }, + { + "epoch": 3.308306709265176, + "grad_norm": 0.08654871582984924, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4142 + }, + { + "epoch": 3.309105431309904, + "grad_norm": 0.11485479772090912, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 4143 + }, + { + "epoch": 3.3099041533546325, + "grad_norm": 0.10812658816576004, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4144 + }, + { + "epoch": 3.310702875399361, + "grad_norm": 0.08537860214710236, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4145 + }, + { + "epoch": 3.3115015974440896, + "grad_norm": 0.10628878325223923, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 4146 + }, + { + "epoch": 3.312300319488818, + "grad_norm": 0.14903275668621063, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 4147 + }, + { + "epoch": 3.313099041533546, + "grad_norm": 0.09670894593000412, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4148 + }, + { + "epoch": 3.3138977635782747, + "grad_norm": 0.10959025472402573, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 4149 + }, + { + "epoch": 3.3146964856230032, + "grad_norm": 0.10397703945636749, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 4150 + }, + { + "epoch": 3.3154952076677318, + "grad_norm": 0.07681623846292496, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 4151 + }, + { + "epoch": 3.31629392971246, + "grad_norm": 0.07938152551651001, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 4152 + }, + { + "epoch": 3.3170926517571884, + "grad_norm": 0.14678052067756653, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 4153 + }, + { + "epoch": 3.317891373801917, + "grad_norm": 0.15366105735301971, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4154 + }, + { + "epoch": 3.3186900958466454, + "grad_norm": 0.13449597358703613, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 4155 + }, + { + "epoch": 3.319488817891374, + "grad_norm": 0.0861068144440651, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 4156 + }, + { + "epoch": 3.3202875399361025, + "grad_norm": 0.0604286752641201, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 4157 + }, + { + "epoch": 3.3210862619808306, + "grad_norm": 0.08299542963504791, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4158 + }, + { + "epoch": 3.321884984025559, + "grad_norm": 0.0738200917840004, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 4159 + }, + { + "epoch": 3.3226837060702876, + "grad_norm": 0.06450676172971725, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4160 + }, + { + "epoch": 3.323482428115016, + "grad_norm": 0.055281370878219604, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4161 + }, + { + "epoch": 3.3242811501597442, + "grad_norm": 0.09895910322666168, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4162 + }, + { + "epoch": 3.3250798722044728, + "grad_norm": 0.10338333994150162, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 4163 + }, + { + "epoch": 3.3258785942492013, + "grad_norm": 0.08346354216337204, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 4164 + }, + { + "epoch": 3.32667731629393, + "grad_norm": 0.15257857739925385, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 4165 + }, + { + "epoch": 3.3274760383386583, + "grad_norm": 0.1782383918762207, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4166 + }, + { + "epoch": 3.3282747603833864, + "grad_norm": 0.09908363968133926, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4167 + }, + { + "epoch": 3.329073482428115, + "grad_norm": 0.18942143023014069, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 4168 + }, + { + "epoch": 3.3298722044728435, + "grad_norm": 0.21095149219036102, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4169 + }, + { + "epoch": 3.330670926517572, + "grad_norm": 0.11597894132137299, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4170 + }, + { + "epoch": 3.3314696485623, + "grad_norm": 0.20450811088085175, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 4171 + }, + { + "epoch": 3.3322683706070286, + "grad_norm": 0.1609300971031189, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4172 + }, + { + "epoch": 3.333067092651757, + "grad_norm": 0.14068877696990967, + "learning_rate": 0.0005, + "loss": 1.0835, + "step": 4173 + }, + { + "epoch": 3.3338658146964857, + "grad_norm": 0.11969266831874847, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 4174 + }, + { + "epoch": 3.334664536741214, + "grad_norm": 0.16986626386642456, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4175 + }, + { + "epoch": 3.3354632587859427, + "grad_norm": 0.2065591812133789, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4176 + }, + { + "epoch": 3.336261980830671, + "grad_norm": 0.23542748391628265, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 4177 + }, + { + "epoch": 3.3370607028753994, + "grad_norm": 0.20896919071674347, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 4178 + }, + { + "epoch": 3.337859424920128, + "grad_norm": 0.16446076333522797, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 4179 + }, + { + "epoch": 3.3386581469648564, + "grad_norm": 0.11143177002668381, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 4180 + }, + { + "epoch": 3.3394568690095845, + "grad_norm": 0.0866970345377922, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 4181 + }, + { + "epoch": 3.340255591054313, + "grad_norm": 0.14608244597911835, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 4182 + }, + { + "epoch": 3.3410543130990416, + "grad_norm": 0.06152384728193283, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4183 + }, + { + "epoch": 3.34185303514377, + "grad_norm": 0.14289656281471252, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4184 + }, + { + "epoch": 3.3426517571884986, + "grad_norm": 0.16735558211803436, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4185 + }, + { + "epoch": 3.3434504792332267, + "grad_norm": 0.09012678265571594, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 4186 + }, + { + "epoch": 3.344249201277955, + "grad_norm": 0.05861378088593483, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4187 + }, + { + "epoch": 3.3450479233226837, + "grad_norm": 0.07123090326786041, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4188 + }, + { + "epoch": 3.3458466453674123, + "grad_norm": 0.07879375666379929, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4189 + }, + { + "epoch": 3.3466453674121404, + "grad_norm": 0.0925324484705925, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 4190 + }, + { + "epoch": 3.347444089456869, + "grad_norm": 0.0686444416642189, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4191 + }, + { + "epoch": 3.3482428115015974, + "grad_norm": 0.08633724600076675, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4192 + }, + { + "epoch": 3.349041533546326, + "grad_norm": 0.056881021708250046, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4193 + }, + { + "epoch": 3.3498402555910545, + "grad_norm": 0.07752947509288788, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4194 + }, + { + "epoch": 3.3506389776357826, + "grad_norm": 0.0927717313170433, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 4195 + }, + { + "epoch": 3.351437699680511, + "grad_norm": 0.09599179029464722, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 4196 + }, + { + "epoch": 3.3522364217252396, + "grad_norm": 0.09090889245271683, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 4197 + }, + { + "epoch": 3.353035143769968, + "grad_norm": 0.12757429480552673, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 4198 + }, + { + "epoch": 3.3538338658146967, + "grad_norm": 0.15210460126399994, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4199 + }, + { + "epoch": 3.3546325878594248, + "grad_norm": 0.10982836782932281, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4200 + }, + { + "epoch": 3.3554313099041533, + "grad_norm": 0.056641776114702225, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4201 + }, + { + "epoch": 3.356230031948882, + "grad_norm": 0.09506776928901672, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4202 + }, + { + "epoch": 3.3570287539936103, + "grad_norm": 0.12064918130636215, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4203 + }, + { + "epoch": 3.357827476038339, + "grad_norm": 0.12343298643827438, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 4204 + }, + { + "epoch": 3.358626198083067, + "grad_norm": 0.11508476734161377, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4205 + }, + { + "epoch": 3.3594249201277955, + "grad_norm": 0.07552453875541687, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4206 + }, + { + "epoch": 3.360223642172524, + "grad_norm": 0.10495936870574951, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 4207 + }, + { + "epoch": 3.3610223642172525, + "grad_norm": 0.13230633735656738, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4208 + }, + { + "epoch": 3.3618210862619806, + "grad_norm": 0.13003787398338318, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4209 + }, + { + "epoch": 3.362619808306709, + "grad_norm": 0.09252234548330307, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 4210 + }, + { + "epoch": 3.3634185303514377, + "grad_norm": 0.07739317417144775, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 4211 + }, + { + "epoch": 3.364217252396166, + "grad_norm": 0.12185318768024445, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4212 + }, + { + "epoch": 3.3650159744408947, + "grad_norm": 0.17643119394779205, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4213 + }, + { + "epoch": 3.365814696485623, + "grad_norm": 0.10462872684001923, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4214 + }, + { + "epoch": 3.3666134185303513, + "grad_norm": 0.1486569344997406, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 4215 + }, + { + "epoch": 3.36741214057508, + "grad_norm": 0.11858930438756943, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4216 + }, + { + "epoch": 3.3682108626198084, + "grad_norm": 0.07907772809267044, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4217 + }, + { + "epoch": 3.369009584664537, + "grad_norm": 0.5416387319564819, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 4218 + }, + { + "epoch": 3.369808306709265, + "grad_norm": 0.08767322450876236, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4219 + }, + { + "epoch": 3.3706070287539935, + "grad_norm": 0.09651107341051102, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4220 + }, + { + "epoch": 3.371405750798722, + "grad_norm": 0.07548791915178299, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4221 + }, + { + "epoch": 3.3722044728434506, + "grad_norm": 0.09317605197429657, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4222 + }, + { + "epoch": 3.373003194888179, + "grad_norm": 0.07431582361459732, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4223 + }, + { + "epoch": 3.373801916932907, + "grad_norm": 0.12754018604755402, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4224 + }, + { + "epoch": 3.3746006389776357, + "grad_norm": 0.12697845697402954, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4225 + }, + { + "epoch": 3.3753993610223643, + "grad_norm": 0.21522995829582214, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4226 + }, + { + "epoch": 3.376198083067093, + "grad_norm": 0.08886270225048065, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 4227 + }, + { + "epoch": 3.376996805111821, + "grad_norm": 0.07107655704021454, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4228 + }, + { + "epoch": 3.3777955271565494, + "grad_norm": 0.07452798634767532, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 4229 + }, + { + "epoch": 3.378594249201278, + "grad_norm": 0.10205573588609695, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4230 + }, + { + "epoch": 3.3793929712460065, + "grad_norm": 0.10990341752767563, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4231 + }, + { + "epoch": 3.380191693290735, + "grad_norm": 0.08567643165588379, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4232 + }, + { + "epoch": 3.380990415335463, + "grad_norm": 0.057073548436164856, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4233 + }, + { + "epoch": 3.3817891373801916, + "grad_norm": 0.12602978944778442, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4234 + }, + { + "epoch": 3.38258785942492, + "grad_norm": 0.1715400218963623, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4235 + }, + { + "epoch": 3.3833865814696487, + "grad_norm": 0.13129903376102448, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 4236 + }, + { + "epoch": 3.384185303514377, + "grad_norm": 0.1308225691318512, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4237 + }, + { + "epoch": 3.3849840255591053, + "grad_norm": 0.1353990137577057, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4238 + }, + { + "epoch": 3.385782747603834, + "grad_norm": 0.08648121356964111, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4239 + }, + { + "epoch": 3.3865814696485623, + "grad_norm": 0.23568236827850342, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4240 + }, + { + "epoch": 3.387380191693291, + "grad_norm": 0.20514735579490662, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4241 + }, + { + "epoch": 3.3881789137380194, + "grad_norm": 0.10276424884796143, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4242 + }, + { + "epoch": 3.3889776357827475, + "grad_norm": 0.1838751584291458, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4243 + }, + { + "epoch": 3.389776357827476, + "grad_norm": 0.1697031557559967, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4244 + }, + { + "epoch": 3.3905750798722045, + "grad_norm": 0.11439084261655807, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4245 + }, + { + "epoch": 3.391373801916933, + "grad_norm": 0.14021249115467072, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4246 + }, + { + "epoch": 3.392172523961661, + "grad_norm": 0.13989558815956116, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4247 + }, + { + "epoch": 3.3929712460063897, + "grad_norm": 0.12039095908403397, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4248 + }, + { + "epoch": 3.393769968051118, + "grad_norm": 0.17901045083999634, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4249 + }, + { + "epoch": 3.3945686900958467, + "grad_norm": 0.1053776666522026, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 4250 + }, + { + "epoch": 3.3953674121405752, + "grad_norm": 1.7777512073516846, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4251 + }, + { + "epoch": 3.3961661341853033, + "grad_norm": 0.06677904725074768, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 4252 + }, + { + "epoch": 3.396964856230032, + "grad_norm": 0.16123540699481964, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4253 + }, + { + "epoch": 3.3977635782747604, + "grad_norm": 0.21530884504318237, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4254 + }, + { + "epoch": 3.398562300319489, + "grad_norm": 0.20979386568069458, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4255 + }, + { + "epoch": 3.3993610223642174, + "grad_norm": 0.14755229651927948, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 4256 + }, + { + "epoch": 3.4001597444089455, + "grad_norm": 0.10182930529117584, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4257 + }, + { + "epoch": 3.400958466453674, + "grad_norm": 0.11478064954280853, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4258 + }, + { + "epoch": 3.4017571884984026, + "grad_norm": 0.2052452266216278, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4259 + }, + { + "epoch": 3.402555910543131, + "grad_norm": 0.6292023062705994, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4260 + }, + { + "epoch": 3.4033546325878596, + "grad_norm": 0.0666726678609848, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4261 + }, + { + "epoch": 3.4041533546325877, + "grad_norm": 0.11848346143960953, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4262 + }, + { + "epoch": 3.4049520766773163, + "grad_norm": 0.15276756882667542, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4263 + }, + { + "epoch": 3.405750798722045, + "grad_norm": 0.08534786105155945, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4264 + }, + { + "epoch": 3.4065495207667733, + "grad_norm": 0.07453266531229019, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4265 + }, + { + "epoch": 3.4073482428115014, + "grad_norm": 0.12894752621650696, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4266 + }, + { + "epoch": 3.40814696485623, + "grad_norm": 0.11341612786054611, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4267 + }, + { + "epoch": 3.4089456869009584, + "grad_norm": 0.06551265716552734, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4268 + }, + { + "epoch": 3.409744408945687, + "grad_norm": 0.08828622102737427, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4269 + }, + { + "epoch": 3.4105431309904155, + "grad_norm": 0.06951884925365448, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4270 + }, + { + "epoch": 3.4113418530351436, + "grad_norm": 0.0785432904958725, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4271 + }, + { + "epoch": 3.412140575079872, + "grad_norm": 0.06681766360998154, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4272 + }, + { + "epoch": 3.4129392971246006, + "grad_norm": 0.060111526399850845, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4273 + }, + { + "epoch": 3.413738019169329, + "grad_norm": 0.07451382279396057, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 4274 + }, + { + "epoch": 3.4145367412140573, + "grad_norm": 0.08646225184202194, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4275 + }, + { + "epoch": 3.415335463258786, + "grad_norm": 0.07061789929866791, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4276 + }, + { + "epoch": 3.4161341853035143, + "grad_norm": 0.09554821997880936, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4277 + }, + { + "epoch": 3.416932907348243, + "grad_norm": 0.11288002133369446, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4278 + }, + { + "epoch": 3.4177316293929714, + "grad_norm": 0.10565607994794846, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4279 + }, + { + "epoch": 3.4185303514377, + "grad_norm": 0.08235503733158112, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4280 + }, + { + "epoch": 3.419329073482428, + "grad_norm": 0.1302265226840973, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4281 + }, + { + "epoch": 3.4201277955271565, + "grad_norm": 0.07910848408937454, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4282 + }, + { + "epoch": 3.420926517571885, + "grad_norm": 0.10624215006828308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4283 + }, + { + "epoch": 3.4217252396166136, + "grad_norm": 0.08545158058404922, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4284 + }, + { + "epoch": 3.4225239616613417, + "grad_norm": 0.07010428607463837, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4285 + }, + { + "epoch": 3.42332268370607, + "grad_norm": 0.08256867527961731, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4286 + }, + { + "epoch": 3.4241214057507987, + "grad_norm": 0.13074247539043427, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4287 + }, + { + "epoch": 3.4249201277955272, + "grad_norm": 0.18332679569721222, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4288 + }, + { + "epoch": 3.4257188498402558, + "grad_norm": 0.1671689748764038, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4289 + }, + { + "epoch": 3.426517571884984, + "grad_norm": 0.10386296361684799, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4290 + }, + { + "epoch": 3.4273162939297124, + "grad_norm": 0.07554108649492264, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 4291 + }, + { + "epoch": 3.428115015974441, + "grad_norm": 0.1138196587562561, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4292 + }, + { + "epoch": 3.4289137380191694, + "grad_norm": 0.1681462526321411, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4293 + }, + { + "epoch": 3.4297124600638975, + "grad_norm": 0.1833198368549347, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4294 + }, + { + "epoch": 3.430511182108626, + "grad_norm": 0.10269228368997574, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4295 + }, + { + "epoch": 3.4313099041533546, + "grad_norm": 0.08876223117113113, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4296 + }, + { + "epoch": 3.432108626198083, + "grad_norm": 0.21489253640174866, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4297 + }, + { + "epoch": 3.4329073482428116, + "grad_norm": 0.22669701278209686, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4298 + }, + { + "epoch": 3.43370607028754, + "grad_norm": 0.16946858167648315, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4299 + }, + { + "epoch": 3.4345047923322682, + "grad_norm": 0.05162649229168892, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4300 + }, + { + "epoch": 3.4353035143769968, + "grad_norm": 0.09700657427310944, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4301 + }, + { + "epoch": 3.4361022364217253, + "grad_norm": 0.14858263731002808, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4302 + }, + { + "epoch": 3.436900958466454, + "grad_norm": 0.16938818991184235, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4303 + }, + { + "epoch": 3.437699680511182, + "grad_norm": 0.13441702723503113, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4304 + }, + { + "epoch": 3.4384984025559104, + "grad_norm": 0.07661818712949753, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4305 + }, + { + "epoch": 3.439297124600639, + "grad_norm": 0.19436489045619965, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4306 + }, + { + "epoch": 3.4400958466453675, + "grad_norm": 0.20447906851768494, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4307 + }, + { + "epoch": 3.440894568690096, + "grad_norm": 0.1414622664451599, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4308 + }, + { + "epoch": 3.441693290734824, + "grad_norm": 0.06289447098970413, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4309 + }, + { + "epoch": 3.4424920127795526, + "grad_norm": 0.0966482162475586, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4310 + }, + { + "epoch": 3.443290734824281, + "grad_norm": 0.1300116777420044, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4311 + }, + { + "epoch": 3.4440894568690097, + "grad_norm": 0.11638098210096359, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4312 + }, + { + "epoch": 3.4448881789137378, + "grad_norm": 0.08284632116556168, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4313 + }, + { + "epoch": 3.4456869009584663, + "grad_norm": 0.0617060512304306, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4314 + }, + { + "epoch": 3.446485623003195, + "grad_norm": 0.12798283994197845, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4315 + }, + { + "epoch": 3.4472843450479234, + "grad_norm": 0.12712593376636505, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4316 + }, + { + "epoch": 3.448083067092652, + "grad_norm": 0.09164460003376007, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4317 + }, + { + "epoch": 3.4488817891373804, + "grad_norm": 0.07618964463472366, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4318 + }, + { + "epoch": 3.4496805111821085, + "grad_norm": 0.07986288517713547, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4319 + }, + { + "epoch": 3.450479233226837, + "grad_norm": 0.0783228650689125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4320 + }, + { + "epoch": 3.4512779552715656, + "grad_norm": 0.09899114072322845, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4321 + }, + { + "epoch": 3.452076677316294, + "grad_norm": 0.13710227608680725, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4322 + }, + { + "epoch": 3.452875399361022, + "grad_norm": 0.1281789392232895, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4323 + }, + { + "epoch": 3.4536741214057507, + "grad_norm": 0.11021110415458679, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4324 + }, + { + "epoch": 3.4544728434504792, + "grad_norm": 0.11450989544391632, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4325 + }, + { + "epoch": 3.4552715654952078, + "grad_norm": 0.09010434150695801, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4326 + }, + { + "epoch": 3.4560702875399363, + "grad_norm": 0.08817321807146072, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4327 + }, + { + "epoch": 3.4568690095846644, + "grad_norm": 0.06502921879291534, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4328 + }, + { + "epoch": 3.457667731629393, + "grad_norm": 0.13399769365787506, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4329 + }, + { + "epoch": 3.4584664536741214, + "grad_norm": 0.19785602390766144, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4330 + }, + { + "epoch": 3.45926517571885, + "grad_norm": 0.15761834383010864, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4331 + }, + { + "epoch": 3.460063897763578, + "grad_norm": 0.11824636161327362, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4332 + }, + { + "epoch": 3.4608626198083066, + "grad_norm": 0.07031631469726562, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4333 + }, + { + "epoch": 3.461661341853035, + "grad_norm": 0.09940601140260696, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4334 + }, + { + "epoch": 3.4624600638977636, + "grad_norm": 0.11931589990854263, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4335 + }, + { + "epoch": 3.463258785942492, + "grad_norm": 0.0967932790517807, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4336 + }, + { + "epoch": 3.4640575079872207, + "grad_norm": 0.09523937106132507, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4337 + }, + { + "epoch": 3.4648562300319488, + "grad_norm": 0.09964902698993683, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4338 + }, + { + "epoch": 3.4656549520766773, + "grad_norm": 0.09898022562265396, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4339 + }, + { + "epoch": 3.466453674121406, + "grad_norm": 0.05388521030545235, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4340 + }, + { + "epoch": 3.4672523961661343, + "grad_norm": 0.06455415487289429, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4341 + }, + { + "epoch": 3.4680511182108624, + "grad_norm": 0.05497310310602188, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4342 + }, + { + "epoch": 3.468849840255591, + "grad_norm": 0.049679841846227646, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4343 + }, + { + "epoch": 3.4696485623003195, + "grad_norm": 0.05664939060807228, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4344 + }, + { + "epoch": 3.470447284345048, + "grad_norm": 0.06651245057582855, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4345 + }, + { + "epoch": 3.4712460063897765, + "grad_norm": 0.08480475097894669, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4346 + }, + { + "epoch": 3.4720447284345046, + "grad_norm": 0.07331875711679459, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4347 + }, + { + "epoch": 3.472843450479233, + "grad_norm": 0.0505477711558342, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4348 + }, + { + "epoch": 3.4736421725239617, + "grad_norm": 0.06969176232814789, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4349 + }, + { + "epoch": 3.47444089456869, + "grad_norm": 0.08915391564369202, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4350 + }, + { + "epoch": 3.4752396166134183, + "grad_norm": 0.09378752112388611, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4351 + }, + { + "epoch": 3.476038338658147, + "grad_norm": 0.059195373207330704, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4352 + }, + { + "epoch": 3.4768370607028753, + "grad_norm": 0.07094884663820267, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4353 + }, + { + "epoch": 3.477635782747604, + "grad_norm": 0.11091995984315872, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4354 + }, + { + "epoch": 3.4784345047923324, + "grad_norm": 0.14018885791301727, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4355 + }, + { + "epoch": 3.479233226837061, + "grad_norm": 0.13553708791732788, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4356 + }, + { + "epoch": 3.480031948881789, + "grad_norm": 0.08005240559577942, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4357 + }, + { + "epoch": 3.4808306709265175, + "grad_norm": 0.05309261009097099, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4358 + }, + { + "epoch": 3.481629392971246, + "grad_norm": 0.09956394135951996, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4359 + }, + { + "epoch": 3.4824281150159746, + "grad_norm": 0.13189470767974854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4360 + }, + { + "epoch": 3.4832268370607027, + "grad_norm": 0.13651393353939056, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4361 + }, + { + "epoch": 3.484025559105431, + "grad_norm": 0.12467528879642487, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4362 + }, + { + "epoch": 3.4848242811501597, + "grad_norm": 0.11428561061620712, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4363 + }, + { + "epoch": 3.4856230031948883, + "grad_norm": 0.12095288187265396, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4364 + }, + { + "epoch": 3.486421725239617, + "grad_norm": 0.05889631807804108, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4365 + }, + { + "epoch": 3.487220447284345, + "grad_norm": 0.1158040463924408, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4366 + }, + { + "epoch": 3.4880191693290734, + "grad_norm": 0.11070148646831512, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4367 + }, + { + "epoch": 3.488817891373802, + "grad_norm": 0.0625298023223877, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4368 + }, + { + "epoch": 3.4896166134185305, + "grad_norm": 0.11865562945604324, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4369 + }, + { + "epoch": 3.4904153354632586, + "grad_norm": 0.12237154692411423, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4370 + }, + { + "epoch": 3.491214057507987, + "grad_norm": 0.05703050270676613, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4371 + }, + { + "epoch": 3.4920127795527156, + "grad_norm": 0.17314022779464722, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4372 + }, + { + "epoch": 3.492811501597444, + "grad_norm": 0.2984711825847626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4373 + }, + { + "epoch": 3.4936102236421727, + "grad_norm": 0.30129608511924744, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4374 + }, + { + "epoch": 3.494408945686901, + "grad_norm": 0.12154170870780945, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4375 + }, + { + "epoch": 3.4952076677316293, + "grad_norm": 0.12467148154973984, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4376 + }, + { + "epoch": 3.496006389776358, + "grad_norm": 0.23285721242427826, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4377 + }, + { + "epoch": 3.4968051118210863, + "grad_norm": 0.20723310112953186, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4378 + }, + { + "epoch": 3.497603833865815, + "grad_norm": 0.13221028447151184, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4379 + }, + { + "epoch": 3.498402555910543, + "grad_norm": 0.06008061394095421, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4380 + }, + { + "epoch": 3.4992012779552715, + "grad_norm": 0.12877988815307617, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4381 + }, + { + "epoch": 3.5, + "grad_norm": 0.1951032429933548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4382 + }, + { + "epoch": 3.5007987220447285, + "grad_norm": 0.13804258406162262, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4383 + }, + { + "epoch": 3.501597444089457, + "grad_norm": 0.06761720031499863, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4384 + }, + { + "epoch": 3.502396166134185, + "grad_norm": 0.13217084109783173, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4385 + }, + { + "epoch": 3.5031948881789137, + "grad_norm": 0.11773377656936646, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4386 + }, + { + "epoch": 3.503993610223642, + "grad_norm": 0.07580399513244629, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4387 + }, + { + "epoch": 3.5047923322683707, + "grad_norm": 0.1739586442708969, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4388 + }, + { + "epoch": 3.505591054313099, + "grad_norm": 0.14863203465938568, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4389 + }, + { + "epoch": 3.5063897763578273, + "grad_norm": 0.07858511805534363, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4390 + }, + { + "epoch": 3.507188498402556, + "grad_norm": 0.15966418385505676, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4391 + }, + { + "epoch": 3.5079872204472844, + "grad_norm": 0.28761810064315796, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4392 + }, + { + "epoch": 3.508785942492013, + "grad_norm": 0.24169668555259705, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4393 + }, + { + "epoch": 3.5095846645367414, + "grad_norm": 0.07907059788703918, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4394 + }, + { + "epoch": 3.5103833865814695, + "grad_norm": 0.20243291556835175, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4395 + }, + { + "epoch": 3.511182108626198, + "grad_norm": 0.302198588848114, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4396 + }, + { + "epoch": 3.5119808306709266, + "grad_norm": 0.2544843554496765, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4397 + }, + { + "epoch": 3.512779552715655, + "grad_norm": 0.07381684333086014, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4398 + }, + { + "epoch": 3.513578274760383, + "grad_norm": 0.17388348281383514, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4399 + }, + { + "epoch": 3.5143769968051117, + "grad_norm": 0.2293306440114975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4400 + }, + { + "epoch": 3.5151757188498403, + "grad_norm": 0.07548263669013977, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4401 + }, + { + "epoch": 3.515974440894569, + "grad_norm": 0.1924273669719696, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4402 + }, + { + "epoch": 3.5167731629392973, + "grad_norm": 0.26867300271987915, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4403 + }, + { + "epoch": 3.5175718849840254, + "grad_norm": 0.14461541175842285, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4404 + }, + { + "epoch": 3.518370607028754, + "grad_norm": 0.12608370184898376, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4405 + }, + { + "epoch": 3.5191693290734825, + "grad_norm": 0.20579756796360016, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4406 + }, + { + "epoch": 3.519968051118211, + "grad_norm": 0.12286399304866791, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4407 + }, + { + "epoch": 3.520766773162939, + "grad_norm": 0.055247388780117035, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4408 + }, + { + "epoch": 3.5215654952076676, + "grad_norm": 0.07877562195062637, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4409 + }, + { + "epoch": 3.522364217252396, + "grad_norm": 0.0769568607211113, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4410 + }, + { + "epoch": 3.5231629392971247, + "grad_norm": 0.0898609384894371, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4411 + }, + { + "epoch": 3.523961661341853, + "grad_norm": 0.057637594640254974, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4412 + }, + { + "epoch": 3.5247603833865817, + "grad_norm": 0.12046241015195847, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4413 + }, + { + "epoch": 3.52555910543131, + "grad_norm": 0.09949496388435364, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4414 + }, + { + "epoch": 3.5263578274760383, + "grad_norm": 0.054411277174949646, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4415 + }, + { + "epoch": 3.527156549520767, + "grad_norm": 0.08293551951646805, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4416 + }, + { + "epoch": 3.527955271565495, + "grad_norm": 0.07669435441493988, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4417 + }, + { + "epoch": 3.5287539936102235, + "grad_norm": 0.06382326781749725, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4418 + }, + { + "epoch": 3.529552715654952, + "grad_norm": 0.07673322409391403, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4419 + }, + { + "epoch": 3.5303514376996805, + "grad_norm": 0.08052650839090347, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4420 + }, + { + "epoch": 3.531150159744409, + "grad_norm": 0.1354246884584427, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4421 + }, + { + "epoch": 3.5319488817891376, + "grad_norm": 0.07951574772596359, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 4422 + }, + { + "epoch": 3.5327476038338657, + "grad_norm": 0.11002526432275772, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4423 + }, + { + "epoch": 3.533546325878594, + "grad_norm": 0.18597234785556793, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4424 + }, + { + "epoch": 3.5343450479233227, + "grad_norm": 0.12601099908351898, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4425 + }, + { + "epoch": 3.5351437699680512, + "grad_norm": 0.11181886494159698, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4426 + }, + { + "epoch": 3.5359424920127793, + "grad_norm": 0.11489108949899673, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4427 + }, + { + "epoch": 3.536741214057508, + "grad_norm": 0.10422708839178085, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4428 + }, + { + "epoch": 3.5375399361022364, + "grad_norm": 0.13701972365379333, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4429 + }, + { + "epoch": 3.538338658146965, + "grad_norm": 0.10713281482458115, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4430 + }, + { + "epoch": 3.5391373801916934, + "grad_norm": 0.11508526653051376, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4431 + }, + { + "epoch": 3.539936102236422, + "grad_norm": 0.061856236308813095, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4432 + }, + { + "epoch": 3.54073482428115, + "grad_norm": 0.12080623209476471, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4433 + }, + { + "epoch": 3.5415335463258786, + "grad_norm": 0.12233573198318481, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4434 + }, + { + "epoch": 3.542332268370607, + "grad_norm": 0.07041362673044205, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4435 + }, + { + "epoch": 3.543130990415335, + "grad_norm": 0.1162526085972786, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4436 + }, + { + "epoch": 3.5439297124600637, + "grad_norm": 0.12962234020233154, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4437 + }, + { + "epoch": 3.5447284345047922, + "grad_norm": 0.1368536353111267, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4438 + }, + { + "epoch": 3.5455271565495208, + "grad_norm": 0.061806995421648026, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4439 + }, + { + "epoch": 3.5463258785942493, + "grad_norm": 0.11016163975000381, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4440 + }, + { + "epoch": 3.547124600638978, + "grad_norm": 0.0992715135216713, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4441 + }, + { + "epoch": 3.547923322683706, + "grad_norm": 0.14015190303325653, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4442 + }, + { + "epoch": 3.5487220447284344, + "grad_norm": 0.07255455106496811, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4443 + }, + { + "epoch": 3.549520766773163, + "grad_norm": 0.13293872773647308, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4444 + }, + { + "epoch": 3.5503194888178915, + "grad_norm": 0.08923539519309998, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4445 + }, + { + "epoch": 3.5511182108626196, + "grad_norm": 0.10125918686389923, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4446 + }, + { + "epoch": 3.551916932907348, + "grad_norm": 0.12369748950004578, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4447 + }, + { + "epoch": 3.5527156549520766, + "grad_norm": 0.14656996726989746, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4448 + }, + { + "epoch": 3.553514376996805, + "grad_norm": 0.14212539792060852, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4449 + }, + { + "epoch": 3.5543130990415337, + "grad_norm": 0.08640166372060776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4450 + }, + { + "epoch": 3.5551118210862622, + "grad_norm": 0.05552735924720764, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4451 + }, + { + "epoch": 3.5559105431309903, + "grad_norm": 0.12888140976428986, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4452 + }, + { + "epoch": 3.556709265175719, + "grad_norm": 0.10696940869092941, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4453 + }, + { + "epoch": 3.5575079872204474, + "grad_norm": 0.06578963249921799, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4454 + }, + { + "epoch": 3.5583067092651754, + "grad_norm": 0.16173291206359863, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4455 + }, + { + "epoch": 3.559105431309904, + "grad_norm": 0.1550486832857132, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4456 + }, + { + "epoch": 3.5599041533546325, + "grad_norm": 0.14084209501743317, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4457 + }, + { + "epoch": 3.560702875399361, + "grad_norm": 0.12024512141942978, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4458 + }, + { + "epoch": 3.5615015974440896, + "grad_norm": 0.12514936923980713, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4459 + }, + { + "epoch": 3.562300319488818, + "grad_norm": 0.16444219648838043, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4460 + }, + { + "epoch": 3.563099041533546, + "grad_norm": 0.11520830541849136, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4461 + }, + { + "epoch": 3.5638977635782747, + "grad_norm": 0.07884586602449417, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4462 + }, + { + "epoch": 3.5646964856230032, + "grad_norm": 0.1655684858560562, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4463 + }, + { + "epoch": 3.5654952076677318, + "grad_norm": 0.15222500264644623, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4464 + }, + { + "epoch": 3.56629392971246, + "grad_norm": 0.06106618419289589, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4465 + }, + { + "epoch": 3.5670926517571884, + "grad_norm": 0.10545333474874496, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4466 + }, + { + "epoch": 3.567891373801917, + "grad_norm": 0.1353088915348053, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4467 + }, + { + "epoch": 3.5686900958466454, + "grad_norm": 0.11200091242790222, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4468 + }, + { + "epoch": 3.569488817891374, + "grad_norm": 0.052965741604566574, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4469 + }, + { + "epoch": 3.5702875399361025, + "grad_norm": 0.1244843453168869, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4470 + }, + { + "epoch": 3.5710862619808306, + "grad_norm": 0.1160016730427742, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4471 + }, + { + "epoch": 3.571884984025559, + "grad_norm": 0.04874402657151222, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4472 + }, + { + "epoch": 3.5726837060702876, + "grad_norm": 0.14222301542758942, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4473 + }, + { + "epoch": 3.5734824281150157, + "grad_norm": 0.1190859004855156, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4474 + }, + { + "epoch": 3.5742811501597442, + "grad_norm": 0.0659632682800293, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4475 + }, + { + "epoch": 3.5750798722044728, + "grad_norm": 0.07350483536720276, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4476 + }, + { + "epoch": 3.5758785942492013, + "grad_norm": 0.1220504492521286, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4477 + }, + { + "epoch": 3.57667731629393, + "grad_norm": 0.08952966332435608, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4478 + }, + { + "epoch": 3.5774760383386583, + "grad_norm": 0.08828000724315643, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4479 + }, + { + "epoch": 3.5782747603833864, + "grad_norm": 0.14621564745903015, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4480 + }, + { + "epoch": 3.579073482428115, + "grad_norm": 0.13653770089149475, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4481 + }, + { + "epoch": 3.5798722044728435, + "grad_norm": 0.0682564228773117, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4482 + }, + { + "epoch": 3.580670926517572, + "grad_norm": 0.06511309742927551, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4483 + }, + { + "epoch": 3.5814696485623, + "grad_norm": 0.08800239861011505, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4484 + }, + { + "epoch": 3.5822683706070286, + "grad_norm": 0.06488335877656937, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4485 + }, + { + "epoch": 3.583067092651757, + "grad_norm": 0.06505738198757172, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4486 + }, + { + "epoch": 3.5838658146964857, + "grad_norm": 0.07395542412996292, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4487 + }, + { + "epoch": 3.584664536741214, + "grad_norm": 0.06717971712350845, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4488 + }, + { + "epoch": 3.5854632587859427, + "grad_norm": 0.056708067655563354, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4489 + }, + { + "epoch": 3.586261980830671, + "grad_norm": 0.06316737830638885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4490 + }, + { + "epoch": 3.5870607028753994, + "grad_norm": 0.06079665198922157, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4491 + }, + { + "epoch": 3.587859424920128, + "grad_norm": 0.1293981820344925, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4492 + }, + { + "epoch": 3.588658146964856, + "grad_norm": 0.08021418750286102, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4493 + }, + { + "epoch": 3.5894568690095845, + "grad_norm": 0.096865214407444, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4494 + }, + { + "epoch": 3.590255591054313, + "grad_norm": 0.06794966757297516, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4495 + }, + { + "epoch": 3.5910543130990416, + "grad_norm": 0.04527222737669945, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4496 + }, + { + "epoch": 3.59185303514377, + "grad_norm": 0.07153941690921783, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4497 + }, + { + "epoch": 3.5926517571884986, + "grad_norm": 0.07480445504188538, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4498 + }, + { + "epoch": 3.5934504792332267, + "grad_norm": 0.09161835163831711, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4499 + }, + { + "epoch": 3.594249201277955, + "grad_norm": 0.08420681953430176, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4500 + }, + { + "epoch": 3.5950479233226837, + "grad_norm": 0.04745415225625038, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4501 + }, + { + "epoch": 3.5958466453674123, + "grad_norm": 0.061325494199991226, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4502 + }, + { + "epoch": 3.5966453674121404, + "grad_norm": 0.08550430834293365, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4503 + }, + { + "epoch": 3.597444089456869, + "grad_norm": 0.09530419111251831, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4504 + }, + { + "epoch": 3.5982428115015974, + "grad_norm": 0.10484769195318222, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4505 + }, + { + "epoch": 3.599041533546326, + "grad_norm": 0.08398665487766266, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4506 + }, + { + "epoch": 3.5998402555910545, + "grad_norm": 0.1644149124622345, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4507 + }, + { + "epoch": 3.600638977635783, + "grad_norm": 0.0803244560956955, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4508 + }, + { + "epoch": 3.601437699680511, + "grad_norm": 0.12512895464897156, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4509 + }, + { + "epoch": 3.6022364217252396, + "grad_norm": 0.1404576301574707, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4510 + }, + { + "epoch": 3.603035143769968, + "grad_norm": 0.10823316127061844, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4511 + }, + { + "epoch": 3.6038338658146962, + "grad_norm": 0.06985688954591751, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4512 + }, + { + "epoch": 3.6046325878594248, + "grad_norm": 0.1651264876127243, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4513 + }, + { + "epoch": 3.6054313099041533, + "grad_norm": 0.19752484560012817, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4514 + }, + { + "epoch": 3.606230031948882, + "grad_norm": 0.20005464553833008, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4515 + }, + { + "epoch": 3.6070287539936103, + "grad_norm": 0.1478145569562912, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4516 + }, + { + "epoch": 3.607827476038339, + "grad_norm": 0.05737901106476784, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4517 + }, + { + "epoch": 3.608626198083067, + "grad_norm": 0.16174650192260742, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4518 + }, + { + "epoch": 3.6094249201277955, + "grad_norm": 0.1959141194820404, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4519 + }, + { + "epoch": 3.610223642172524, + "grad_norm": 0.09767267853021622, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4520 + }, + { + "epoch": 3.6110223642172525, + "grad_norm": 0.10553760081529617, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4521 + }, + { + "epoch": 3.6118210862619806, + "grad_norm": 0.19380977749824524, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4522 + }, + { + "epoch": 3.612619808306709, + "grad_norm": 0.2024526745080948, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4523 + }, + { + "epoch": 3.6134185303514377, + "grad_norm": 0.09705837070941925, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4524 + }, + { + "epoch": 3.614217252396166, + "grad_norm": 0.12530986964702606, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4525 + }, + { + "epoch": 3.6150159744408947, + "grad_norm": 0.20901283621788025, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4526 + }, + { + "epoch": 3.6158146964856233, + "grad_norm": 0.16532309353351593, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4527 + }, + { + "epoch": 3.6166134185303513, + "grad_norm": 0.18353991210460663, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4528 + }, + { + "epoch": 3.61741214057508, + "grad_norm": 0.12912365794181824, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4529 + }, + { + "epoch": 3.6182108626198084, + "grad_norm": 0.2052653580904007, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4530 + }, + { + "epoch": 3.6190095846645365, + "grad_norm": 0.1395503133535385, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4531 + }, + { + "epoch": 3.619808306709265, + "grad_norm": 0.07939961552619934, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4532 + }, + { + "epoch": 3.6206070287539935, + "grad_norm": 0.10098318755626678, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4533 + }, + { + "epoch": 3.621405750798722, + "grad_norm": 0.14332561194896698, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4534 + }, + { + "epoch": 3.6222044728434506, + "grad_norm": 0.09697199612855911, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4535 + }, + { + "epoch": 3.623003194888179, + "grad_norm": 0.07785658538341522, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4536 + }, + { + "epoch": 3.623801916932907, + "grad_norm": 0.11263108998537064, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4537 + }, + { + "epoch": 3.6246006389776357, + "grad_norm": 0.18257030844688416, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4538 + }, + { + "epoch": 3.6253993610223643, + "grad_norm": 0.1456373631954193, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4539 + }, + { + "epoch": 3.626198083067093, + "grad_norm": 0.06831679493188858, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4540 + }, + { + "epoch": 3.626996805111821, + "grad_norm": 0.12324535846710205, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4541 + }, + { + "epoch": 3.6277955271565494, + "grad_norm": 0.15868282318115234, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4542 + }, + { + "epoch": 3.628594249201278, + "grad_norm": 0.09355167299509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4543 + }, + { + "epoch": 3.6293929712460065, + "grad_norm": 0.08047328144311905, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4544 + }, + { + "epoch": 3.630191693290735, + "grad_norm": 0.12683328986167908, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4545 + }, + { + "epoch": 3.6309904153354635, + "grad_norm": 0.11964920908212662, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4546 + }, + { + "epoch": 3.6317891373801916, + "grad_norm": 0.0504109226167202, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4547 + }, + { + "epoch": 3.63258785942492, + "grad_norm": 0.11909852921962738, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4548 + }, + { + "epoch": 3.6333865814696487, + "grad_norm": 0.16763992607593536, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4549 + }, + { + "epoch": 3.6341853035143767, + "grad_norm": 0.1486649513244629, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4550 + }, + { + "epoch": 3.6349840255591053, + "grad_norm": 0.06941305845975876, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4551 + }, + { + "epoch": 3.635782747603834, + "grad_norm": 0.1177566722035408, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4552 + }, + { + "epoch": 3.6365814696485623, + "grad_norm": 0.23368601500988007, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4553 + }, + { + "epoch": 3.637380191693291, + "grad_norm": 0.24657249450683594, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4554 + }, + { + "epoch": 3.6381789137380194, + "grad_norm": 0.10063605010509491, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4555 + }, + { + "epoch": 3.6389776357827475, + "grad_norm": 0.1553603708744049, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4556 + }, + { + "epoch": 3.639776357827476, + "grad_norm": 0.25588107109069824, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4557 + }, + { + "epoch": 3.6405750798722045, + "grad_norm": 0.15270236134529114, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4558 + }, + { + "epoch": 3.641373801916933, + "grad_norm": 0.108666330575943, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4559 + }, + { + "epoch": 3.642172523961661, + "grad_norm": 0.19828133285045624, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4560 + }, + { + "epoch": 3.6429712460063897, + "grad_norm": 0.21500051021575928, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4561 + }, + { + "epoch": 3.643769968051118, + "grad_norm": 0.16299934685230255, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4562 + }, + { + "epoch": 3.6445686900958467, + "grad_norm": 0.07390763610601425, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4563 + }, + { + "epoch": 3.6453674121405752, + "grad_norm": 0.22709119319915771, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4564 + }, + { + "epoch": 3.6461661341853038, + "grad_norm": 0.15557943284511566, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4565 + }, + { + "epoch": 3.646964856230032, + "grad_norm": 0.062457580119371414, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4566 + }, + { + "epoch": 3.6477635782747604, + "grad_norm": 0.09101095795631409, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4567 + }, + { + "epoch": 3.648562300319489, + "grad_norm": 0.08700825273990631, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 4568 + }, + { + "epoch": 3.649361022364217, + "grad_norm": 0.058703795075416565, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4569 + }, + { + "epoch": 3.6501597444089455, + "grad_norm": 0.056776538491249084, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4570 + }, + { + "epoch": 3.650958466453674, + "grad_norm": 0.062245409935712814, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4571 + }, + { + "epoch": 3.6517571884984026, + "grad_norm": 0.0534074492752552, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 4572 + }, + { + "epoch": 3.652555910543131, + "grad_norm": 0.09061384946107864, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4573 + }, + { + "epoch": 3.6533546325878596, + "grad_norm": 0.07323598116636276, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4574 + }, + { + "epoch": 3.6541533546325877, + "grad_norm": 0.1120329350233078, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 4575 + }, + { + "epoch": 3.6549520766773163, + "grad_norm": 0.07965485006570816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4576 + }, + { + "epoch": 3.655750798722045, + "grad_norm": 0.06320462375879288, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4577 + }, + { + "epoch": 3.6565495207667733, + "grad_norm": 0.07869421690702438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4578 + }, + { + "epoch": 3.6573482428115014, + "grad_norm": 0.09003151208162308, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4579 + }, + { + "epoch": 3.65814696485623, + "grad_norm": 0.05570388212800026, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4580 + }, + { + "epoch": 3.6589456869009584, + "grad_norm": 0.15563733875751495, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4581 + }, + { + "epoch": 3.659744408945687, + "grad_norm": 0.1422414481639862, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4582 + }, + { + "epoch": 3.6605431309904155, + "grad_norm": 0.13704177737236023, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4583 + }, + { + "epoch": 3.661341853035144, + "grad_norm": 0.36126458644866943, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4584 + }, + { + "epoch": 3.662140575079872, + "grad_norm": 0.09024632722139359, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4585 + }, + { + "epoch": 3.6629392971246006, + "grad_norm": 0.07135412096977234, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4586 + }, + { + "epoch": 3.663738019169329, + "grad_norm": 0.06172417849302292, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4587 + }, + { + "epoch": 3.6645367412140573, + "grad_norm": 0.05962595343589783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4588 + }, + { + "epoch": 3.665335463258786, + "grad_norm": 0.07063078880310059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4589 + }, + { + "epoch": 3.6661341853035143, + "grad_norm": 0.1445596069097519, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4590 + }, + { + "epoch": 3.666932907348243, + "grad_norm": 0.09224060922861099, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4591 + }, + { + "epoch": 3.6677316293929714, + "grad_norm": 0.10353037714958191, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4592 + }, + { + "epoch": 3.6685303514377, + "grad_norm": 0.10922796279191971, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4593 + }, + { + "epoch": 3.669329073482428, + "grad_norm": 0.08728764951229095, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4594 + }, + { + "epoch": 3.6701277955271565, + "grad_norm": 0.0639081671833992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4595 + }, + { + "epoch": 3.670926517571885, + "grad_norm": 0.050491299480199814, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4596 + }, + { + "epoch": 3.6717252396166136, + "grad_norm": 0.07127548009157181, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4597 + }, + { + "epoch": 3.6725239616613417, + "grad_norm": 0.05432606860995293, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4598 + }, + { + "epoch": 3.67332268370607, + "grad_norm": 0.0653342455625534, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4599 + }, + { + "epoch": 3.6741214057507987, + "grad_norm": 0.08766797184944153, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4600 + }, + { + "epoch": 3.6749201277955272, + "grad_norm": 0.0816602036356926, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4601 + }, + { + "epoch": 3.6757188498402558, + "grad_norm": 0.08774783462285995, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4602 + }, + { + "epoch": 3.6765175718849843, + "grad_norm": 0.07776570320129395, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4603 + }, + { + "epoch": 3.6773162939297124, + "grad_norm": 0.07067213952541351, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4604 + }, + { + "epoch": 3.678115015974441, + "grad_norm": 0.06581863760948181, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4605 + }, + { + "epoch": 3.6789137380191694, + "grad_norm": 0.08631278574466705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4606 + }, + { + "epoch": 3.6797124600638975, + "grad_norm": 0.10875384509563446, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4607 + }, + { + "epoch": 3.680511182108626, + "grad_norm": 0.11207764595746994, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4608 + }, + { + "epoch": 3.6813099041533546, + "grad_norm": 0.08943730592727661, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4609 + }, + { + "epoch": 3.682108626198083, + "grad_norm": 0.1922001987695694, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4610 + }, + { + "epoch": 3.6829073482428116, + "grad_norm": 0.10121189057826996, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4611 + }, + { + "epoch": 3.68370607028754, + "grad_norm": 0.05991055443882942, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4612 + }, + { + "epoch": 3.6845047923322682, + "grad_norm": 0.0897853821516037, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4613 + }, + { + "epoch": 3.6853035143769968, + "grad_norm": 0.13160353899002075, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4614 + }, + { + "epoch": 3.6861022364217253, + "grad_norm": 0.13855913281440735, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4615 + }, + { + "epoch": 3.686900958466454, + "grad_norm": 0.11086787283420563, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4616 + }, + { + "epoch": 3.687699680511182, + "grad_norm": 0.07992085069417953, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4617 + }, + { + "epoch": 3.6884984025559104, + "grad_norm": 0.11618958413600922, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4618 + }, + { + "epoch": 3.689297124600639, + "grad_norm": 0.19551296532154083, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4619 + }, + { + "epoch": 3.6900958466453675, + "grad_norm": 0.20239807665348053, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4620 + }, + { + "epoch": 3.690894568690096, + "grad_norm": 0.13233833014965057, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4621 + }, + { + "epoch": 3.6916932907348246, + "grad_norm": 0.08789848536252975, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4622 + }, + { + "epoch": 3.6924920127795526, + "grad_norm": 0.2363075315952301, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4623 + }, + { + "epoch": 3.693290734824281, + "grad_norm": 0.2585245668888092, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4624 + }, + { + "epoch": 3.6940894568690097, + "grad_norm": 0.15822109580039978, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4625 + }, + { + "epoch": 3.6948881789137378, + "grad_norm": 0.07197296619415283, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4626 + }, + { + "epoch": 3.6956869009584663, + "grad_norm": 0.21067900955677032, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4627 + }, + { + "epoch": 3.696485623003195, + "grad_norm": 0.19520802795886993, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4628 + }, + { + "epoch": 3.6972843450479234, + "grad_norm": 0.08310793340206146, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4629 + }, + { + "epoch": 3.698083067092652, + "grad_norm": 0.2118932604789734, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4630 + }, + { + "epoch": 3.6988817891373804, + "grad_norm": 0.2236505001783371, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4631 + }, + { + "epoch": 3.6996805111821085, + "grad_norm": 0.16256077587604523, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4632 + }, + { + "epoch": 3.700479233226837, + "grad_norm": 0.14406970143318176, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4633 + }, + { + "epoch": 3.7012779552715656, + "grad_norm": 0.09738676995038986, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4634 + }, + { + "epoch": 3.702076677316294, + "grad_norm": 0.07531408965587616, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4635 + }, + { + "epoch": 3.702875399361022, + "grad_norm": 0.11631188541650772, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4636 + }, + { + "epoch": 3.7036741214057507, + "grad_norm": 0.11661874502897263, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4637 + }, + { + "epoch": 3.7044728434504792, + "grad_norm": 0.11709950119256973, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 4638 + }, + { + "epoch": 3.7052715654952078, + "grad_norm": 0.13420704007148743, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4639 + }, + { + "epoch": 3.7060702875399363, + "grad_norm": 0.08842958509922028, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4640 + }, + { + "epoch": 3.706869009584665, + "grad_norm": 0.07295326143503189, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4641 + }, + { + "epoch": 3.707667731629393, + "grad_norm": 0.14573390781879425, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4642 + }, + { + "epoch": 3.7084664536741214, + "grad_norm": 0.06639868766069412, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4643 + }, + { + "epoch": 3.70926517571885, + "grad_norm": 0.05936001241207123, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4644 + }, + { + "epoch": 3.710063897763578, + "grad_norm": 0.06534209847450256, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4645 + }, + { + "epoch": 3.7108626198083066, + "grad_norm": 0.13101834058761597, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4646 + }, + { + "epoch": 3.711661341853035, + "grad_norm": 0.07707498222589493, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4647 + }, + { + "epoch": 3.7124600638977636, + "grad_norm": 0.09272165596485138, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4648 + }, + { + "epoch": 3.713258785942492, + "grad_norm": 0.12538838386535645, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4649 + }, + { + "epoch": 3.7140575079872207, + "grad_norm": 0.10816318541765213, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4650 + }, + { + "epoch": 3.7148562300319488, + "grad_norm": 0.10610290616750717, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4651 + }, + { + "epoch": 3.7156549520766773, + "grad_norm": 0.09520592540502548, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4652 + }, + { + "epoch": 3.716453674121406, + "grad_norm": 0.05595150217413902, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4653 + }, + { + "epoch": 3.7172523961661343, + "grad_norm": 0.08114545047283173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4654 + }, + { + "epoch": 3.7180511182108624, + "grad_norm": 0.16090086102485657, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4655 + }, + { + "epoch": 3.718849840255591, + "grad_norm": 0.16332058608531952, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4656 + }, + { + "epoch": 3.7196485623003195, + "grad_norm": 0.17694437503814697, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4657 + }, + { + "epoch": 3.720447284345048, + "grad_norm": 0.16341771185398102, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4658 + }, + { + "epoch": 3.7212460063897765, + "grad_norm": 0.12268038839101791, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4659 + }, + { + "epoch": 3.722044728434505, + "grad_norm": 0.09971031546592712, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4660 + }, + { + "epoch": 3.722843450479233, + "grad_norm": 0.08546486496925354, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4661 + }, + { + "epoch": 3.7236421725239617, + "grad_norm": 0.15427617728710175, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4662 + }, + { + "epoch": 3.72444089456869, + "grad_norm": 0.1291000247001648, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4663 + }, + { + "epoch": 3.7252396166134183, + "grad_norm": 0.06823746860027313, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4664 + }, + { + "epoch": 3.726038338658147, + "grad_norm": 0.08133388310670853, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4665 + }, + { + "epoch": 3.7268370607028753, + "grad_norm": 0.08803416788578033, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4666 + }, + { + "epoch": 3.727635782747604, + "grad_norm": 0.05898858234286308, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4667 + }, + { + "epoch": 3.7284345047923324, + "grad_norm": 0.07650687545537949, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 4668 + }, + { + "epoch": 3.729233226837061, + "grad_norm": 0.15048138797283173, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4669 + }, + { + "epoch": 3.730031948881789, + "grad_norm": 0.08594254404306412, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4670 + }, + { + "epoch": 3.7308306709265175, + "grad_norm": 0.05322937294840813, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4671 + }, + { + "epoch": 3.731629392971246, + "grad_norm": 0.14541727304458618, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4672 + }, + { + "epoch": 3.7324281150159746, + "grad_norm": 0.10300826281309128, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4673 + }, + { + "epoch": 3.7332268370607027, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4674 + }, + { + "epoch": 3.734025559105431, + "grad_norm": 0.07101032137870789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4675 + }, + { + "epoch": 3.7348242811501597, + "grad_norm": 0.09166763722896576, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4676 + }, + { + "epoch": 3.7356230031948883, + "grad_norm": 0.06929054856300354, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4677 + }, + { + "epoch": 3.736421725239617, + "grad_norm": 0.05935844033956528, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4678 + }, + { + "epoch": 3.737220447284345, + "grad_norm": 0.09101571142673492, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4679 + }, + { + "epoch": 3.7380191693290734, + "grad_norm": 0.0979514792561531, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4680 + }, + { + "epoch": 3.738817891373802, + "grad_norm": 0.07105522602796555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4681 + }, + { + "epoch": 3.7396166134185305, + "grad_norm": 0.05741708725690842, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4682 + }, + { + "epoch": 3.7404153354632586, + "grad_norm": 0.051515400409698486, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4683 + }, + { + "epoch": 3.741214057507987, + "grad_norm": 0.06484496593475342, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4684 + }, + { + "epoch": 3.7420127795527156, + "grad_norm": 0.056751761585474014, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4685 + }, + { + "epoch": 3.742811501597444, + "grad_norm": 0.09628041833639145, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4686 + }, + { + "epoch": 3.7436102236421727, + "grad_norm": 0.13367851078510284, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 4687 + }, + { + "epoch": 3.744408945686901, + "grad_norm": 0.10439570248126984, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4688 + }, + { + "epoch": 3.7452076677316293, + "grad_norm": 0.05516012758016586, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4689 + }, + { + "epoch": 3.746006389776358, + "grad_norm": 0.0721910372376442, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 4690 + }, + { + "epoch": 3.7468051118210863, + "grad_norm": 0.10327166318893433, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4691 + }, + { + "epoch": 3.747603833865815, + "grad_norm": 0.10419414937496185, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4692 + }, + { + "epoch": 3.748402555910543, + "grad_norm": 0.07322157919406891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4693 + }, + { + "epoch": 3.7492012779552715, + "grad_norm": 0.05000368133187294, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4694 + }, + { + "epoch": 3.75, + "grad_norm": 0.055239707231521606, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4695 + }, + { + "epoch": 3.7507987220447285, + "grad_norm": 0.14060117304325104, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 4696 + }, + { + "epoch": 3.751597444089457, + "grad_norm": 0.1366022527217865, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4697 + }, + { + "epoch": 3.752396166134185, + "grad_norm": 0.15003731846809387, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4698 + }, + { + "epoch": 3.7531948881789137, + "grad_norm": 0.11602472513914108, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4699 + }, + { + "epoch": 3.753993610223642, + "grad_norm": 0.06956090778112411, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4700 + }, + { + "epoch": 3.7547923322683707, + "grad_norm": 0.04711974412202835, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4701 + }, + { + "epoch": 3.755591054313099, + "grad_norm": 0.09257466346025467, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4702 + }, + { + "epoch": 3.7563897763578273, + "grad_norm": 0.06598426401615143, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4703 + }, + { + "epoch": 3.757188498402556, + "grad_norm": 0.06239036098122597, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4704 + }, + { + "epoch": 3.7579872204472844, + "grad_norm": 0.10065969824790955, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4705 + }, + { + "epoch": 3.758785942492013, + "grad_norm": 0.12874993681907654, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4706 + }, + { + "epoch": 3.7595846645367414, + "grad_norm": 0.10291960090398788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4707 + }, + { + "epoch": 3.7603833865814695, + "grad_norm": 0.06138000637292862, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4708 + }, + { + "epoch": 3.761182108626198, + "grad_norm": 0.11565262079238892, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4709 + }, + { + "epoch": 3.7619808306709266, + "grad_norm": 0.08041521906852722, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4710 + }, + { + "epoch": 3.762779552715655, + "grad_norm": 0.07228218764066696, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4711 + }, + { + "epoch": 3.763578274760383, + "grad_norm": 0.09155906736850739, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4712 + }, + { + "epoch": 3.7643769968051117, + "grad_norm": 0.07468429207801819, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4713 + }, + { + "epoch": 3.7651757188498403, + "grad_norm": 0.07629574090242386, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4714 + }, + { + "epoch": 3.765974440894569, + "grad_norm": 0.1118689477443695, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4715 + }, + { + "epoch": 3.7667731629392973, + "grad_norm": 0.07983580976724625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4716 + }, + { + "epoch": 3.7675718849840254, + "grad_norm": 0.07225694507360458, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4717 + }, + { + "epoch": 3.768370607028754, + "grad_norm": 0.1322079598903656, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4718 + }, + { + "epoch": 3.7691693290734825, + "grad_norm": 0.17217211425304413, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4719 + }, + { + "epoch": 3.769968051118211, + "grad_norm": 0.14665336906909943, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4720 + }, + { + "epoch": 3.770766773162939, + "grad_norm": 0.09977035969495773, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4721 + }, + { + "epoch": 3.7715654952076676, + "grad_norm": 0.1346946358680725, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4722 + }, + { + "epoch": 3.772364217252396, + "grad_norm": 0.17330871522426605, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 4723 + }, + { + "epoch": 3.7731629392971247, + "grad_norm": 0.17789506912231445, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4724 + }, + { + "epoch": 3.773961661341853, + "grad_norm": 0.06285518407821655, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4725 + }, + { + "epoch": 3.7747603833865817, + "grad_norm": 0.13192926347255707, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4726 + }, + { + "epoch": 3.77555910543131, + "grad_norm": 0.12157132476568222, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4727 + }, + { + "epoch": 3.7763578274760383, + "grad_norm": 0.1203337088227272, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4728 + }, + { + "epoch": 3.777156549520767, + "grad_norm": 0.16711866855621338, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4729 + }, + { + "epoch": 3.777955271565495, + "grad_norm": 0.13596504926681519, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4730 + }, + { + "epoch": 3.7787539936102235, + "grad_norm": 0.13502761721611023, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4731 + }, + { + "epoch": 3.779552715654952, + "grad_norm": 0.0751141607761383, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4732 + }, + { + "epoch": 3.7803514376996805, + "grad_norm": 0.1104620099067688, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4733 + }, + { + "epoch": 3.781150159744409, + "grad_norm": 0.06397949904203415, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4734 + }, + { + "epoch": 3.7819488817891376, + "grad_norm": 0.07850230485200882, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 4735 + }, + { + "epoch": 3.7827476038338657, + "grad_norm": 0.10330549627542496, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4736 + }, + { + "epoch": 3.783546325878594, + "grad_norm": 0.08978938311338425, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4737 + }, + { + "epoch": 3.7843450479233227, + "grad_norm": 0.07073058933019638, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4738 + }, + { + "epoch": 3.7851437699680512, + "grad_norm": 0.05997786670923233, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4739 + }, + { + "epoch": 3.7859424920127793, + "grad_norm": 0.0779404565691948, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4740 + }, + { + "epoch": 3.786741214057508, + "grad_norm": 0.1367640644311905, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4741 + }, + { + "epoch": 3.7875399361022364, + "grad_norm": 0.08670534938573837, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4742 + }, + { + "epoch": 3.788338658146965, + "grad_norm": 0.08612547069787979, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4743 + }, + { + "epoch": 3.7891373801916934, + "grad_norm": 0.06312929093837738, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4744 + }, + { + "epoch": 3.789936102236422, + "grad_norm": 0.06397293508052826, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4745 + }, + { + "epoch": 3.79073482428115, + "grad_norm": 0.0663115605711937, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4746 + }, + { + "epoch": 3.7915335463258786, + "grad_norm": 0.07580576092004776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4747 + }, + { + "epoch": 3.792332268370607, + "grad_norm": 0.12604761123657227, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4748 + }, + { + "epoch": 3.793130990415335, + "grad_norm": 0.08900050073862076, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4749 + }, + { + "epoch": 3.7939297124600637, + "grad_norm": 0.09280730038881302, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4750 + }, + { + "epoch": 3.7947284345047922, + "grad_norm": 0.17689163982868195, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 4751 + }, + { + "epoch": 3.7955271565495208, + "grad_norm": 0.06348183006048203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4752 + }, + { + "epoch": 3.7963258785942493, + "grad_norm": 0.12626387178897858, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 4753 + }, + { + "epoch": 3.797124600638978, + "grad_norm": 0.1138390377163887, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4754 + }, + { + "epoch": 3.797923322683706, + "grad_norm": 0.08058728277683258, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4755 + }, + { + "epoch": 3.7987220447284344, + "grad_norm": 0.09671882539987564, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4756 + }, + { + "epoch": 3.799520766773163, + "grad_norm": 0.12193922698497772, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4757 + }, + { + "epoch": 3.8003194888178915, + "grad_norm": 0.31105268001556396, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4758 + }, + { + "epoch": 3.8011182108626196, + "grad_norm": 0.10482051223516464, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4759 + }, + { + "epoch": 3.801916932907348, + "grad_norm": 0.09116382896900177, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4760 + }, + { + "epoch": 3.8027156549520766, + "grad_norm": 0.08212421089410782, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4761 + }, + { + "epoch": 3.803514376996805, + "grad_norm": 0.08267461508512497, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4762 + }, + { + "epoch": 3.8043130990415337, + "grad_norm": 0.13247907161712646, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4763 + }, + { + "epoch": 3.8051118210862622, + "grad_norm": 0.1083490327000618, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 4764 + }, + { + "epoch": 3.8059105431309903, + "grad_norm": 0.11947019398212433, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4765 + }, + { + "epoch": 3.806709265175719, + "grad_norm": 0.08462221175432205, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4766 + }, + { + "epoch": 3.8075079872204474, + "grad_norm": 0.07244928181171417, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4767 + }, + { + "epoch": 3.8083067092651754, + "grad_norm": 0.13432611525058746, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4768 + }, + { + "epoch": 3.809105431309904, + "grad_norm": 0.16640888154506683, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4769 + }, + { + "epoch": 3.8099041533546325, + "grad_norm": 0.12189232558012009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4770 + }, + { + "epoch": 3.810702875399361, + "grad_norm": 0.052367180585861206, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4771 + }, + { + "epoch": 3.8115015974440896, + "grad_norm": 0.10426424443721771, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4772 + }, + { + "epoch": 3.812300319488818, + "grad_norm": 0.11365417391061783, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4773 + }, + { + "epoch": 3.813099041533546, + "grad_norm": 0.07064168155193329, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4774 + }, + { + "epoch": 3.8138977635782747, + "grad_norm": 0.2107549011707306, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4775 + }, + { + "epoch": 3.8146964856230032, + "grad_norm": 0.2984449565410614, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4776 + }, + { + "epoch": 3.8154952076677318, + "grad_norm": 0.26252058148384094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4777 + }, + { + "epoch": 3.81629392971246, + "grad_norm": 0.08128907531499863, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4778 + }, + { + "epoch": 3.8170926517571884, + "grad_norm": 0.2724008858203888, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4779 + }, + { + "epoch": 3.817891373801917, + "grad_norm": 0.2646482288837433, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4780 + }, + { + "epoch": 3.8186900958466454, + "grad_norm": 0.16063876450061798, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4781 + }, + { + "epoch": 3.819488817891374, + "grad_norm": 0.11671862006187439, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4782 + }, + { + "epoch": 3.8202875399361025, + "grad_norm": 0.21605245769023895, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4783 + }, + { + "epoch": 3.8210862619808306, + "grad_norm": 0.17344583570957184, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4784 + }, + { + "epoch": 3.821884984025559, + "grad_norm": 0.08113347738981247, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4785 + }, + { + "epoch": 3.8226837060702876, + "grad_norm": 0.11774581670761108, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4786 + }, + { + "epoch": 3.8234824281150157, + "grad_norm": 0.2024560272693634, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4787 + }, + { + "epoch": 3.8242811501597442, + "grad_norm": 0.5578162670135498, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 4788 + }, + { + "epoch": 3.8250798722044728, + "grad_norm": 0.10354574024677277, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4789 + }, + { + "epoch": 3.8258785942492013, + "grad_norm": 0.14583979547023773, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4790 + }, + { + "epoch": 3.82667731629393, + "grad_norm": 0.15853755176067352, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4791 + }, + { + "epoch": 3.8274760383386583, + "grad_norm": 0.1308104395866394, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4792 + }, + { + "epoch": 3.8282747603833864, + "grad_norm": 0.04385368898510933, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4793 + }, + { + "epoch": 3.829073482428115, + "grad_norm": 0.16213825345039368, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4794 + }, + { + "epoch": 3.8298722044728435, + "grad_norm": 0.2693546414375305, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4795 + }, + { + "epoch": 3.830670926517572, + "grad_norm": 0.23904170095920563, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4796 + }, + { + "epoch": 3.8314696485623, + "grad_norm": 0.11313450336456299, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4797 + }, + { + "epoch": 3.8322683706070286, + "grad_norm": 0.0770820751786232, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4798 + }, + { + "epoch": 3.833067092651757, + "grad_norm": 0.8537606596946716, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4799 + }, + { + "epoch": 3.8338658146964857, + "grad_norm": 0.13684043288230896, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4800 + }, + { + "epoch": 3.834664536741214, + "grad_norm": 0.0890694409608841, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 4801 + }, + { + "epoch": 3.8354632587859427, + "grad_norm": 0.060917336493730545, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4802 + }, + { + "epoch": 3.836261980830671, + "grad_norm": 0.13864673674106598, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4803 + }, + { + "epoch": 3.8370607028753994, + "grad_norm": 0.15316139161586761, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4804 + }, + { + "epoch": 3.837859424920128, + "grad_norm": 0.061508018523454666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4805 + }, + { + "epoch": 3.838658146964856, + "grad_norm": 0.126112699508667, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4806 + }, + { + "epoch": 3.8394568690095845, + "grad_norm": 0.1663133054971695, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4807 + }, + { + "epoch": 3.840255591054313, + "grad_norm": 0.14435894787311554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4808 + }, + { + "epoch": 3.8410543130990416, + "grad_norm": 0.06042332574725151, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4809 + }, + { + "epoch": 3.84185303514377, + "grad_norm": 0.12759631872177124, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4810 + }, + { + "epoch": 3.8426517571884986, + "grad_norm": 0.18153302371501923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4811 + }, + { + "epoch": 3.8434504792332267, + "grad_norm": 0.1280708760023117, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4812 + }, + { + "epoch": 3.844249201277955, + "grad_norm": 0.07144157588481903, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4813 + }, + { + "epoch": 3.8450479233226837, + "grad_norm": 0.13078796863555908, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4814 + }, + { + "epoch": 3.8458466453674123, + "grad_norm": 0.16230762004852295, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4815 + }, + { + "epoch": 3.8466453674121404, + "grad_norm": 0.10997766256332397, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4816 + }, + { + "epoch": 3.847444089456869, + "grad_norm": 0.06006971001625061, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4817 + }, + { + "epoch": 3.8482428115015974, + "grad_norm": 0.10155797749757767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4818 + }, + { + "epoch": 3.849041533546326, + "grad_norm": 0.11125919967889786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4819 + }, + { + "epoch": 3.8498402555910545, + "grad_norm": 0.0860416367650032, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 4820 + }, + { + "epoch": 3.850638977635783, + "grad_norm": 0.0862870067358017, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4821 + }, + { + "epoch": 3.851437699680511, + "grad_norm": 0.07229744642972946, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4822 + }, + { + "epoch": 3.8522364217252396, + "grad_norm": 0.10448424518108368, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4823 + }, + { + "epoch": 3.853035143769968, + "grad_norm": 0.08971705287694931, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 4824 + }, + { + "epoch": 3.8538338658146962, + "grad_norm": 0.09876695275306702, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4825 + }, + { + "epoch": 3.8546325878594248, + "grad_norm": 0.0667971819639206, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4826 + }, + { + "epoch": 3.8554313099041533, + "grad_norm": 0.14437620341777802, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4827 + }, + { + "epoch": 3.856230031948882, + "grad_norm": 0.17627735435962677, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4828 + }, + { + "epoch": 3.8570287539936103, + "grad_norm": 0.10524439066648483, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4829 + }, + { + "epoch": 3.857827476038339, + "grad_norm": 0.15091893076896667, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4830 + }, + { + "epoch": 3.858626198083067, + "grad_norm": 0.22534102201461792, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4831 + }, + { + "epoch": 3.8594249201277955, + "grad_norm": 0.08298768103122711, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4832 + }, + { + "epoch": 3.860223642172524, + "grad_norm": 0.16647395491600037, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4833 + }, + { + "epoch": 3.8610223642172525, + "grad_norm": 0.22512534260749817, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4834 + }, + { + "epoch": 3.8618210862619806, + "grad_norm": 0.2130710482597351, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4835 + }, + { + "epoch": 3.862619808306709, + "grad_norm": 0.1250864863395691, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4836 + }, + { + "epoch": 3.8634185303514377, + "grad_norm": 0.13937048614025116, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4837 + }, + { + "epoch": 3.864217252396166, + "grad_norm": 0.19059741497039795, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4838 + }, + { + "epoch": 3.8650159744408947, + "grad_norm": 0.22080829739570618, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4839 + }, + { + "epoch": 3.8658146964856233, + "grad_norm": 0.09463749825954437, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 4840 + }, + { + "epoch": 3.8666134185303513, + "grad_norm": 0.16431698203086853, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4841 + }, + { + "epoch": 3.86741214057508, + "grad_norm": 0.2162260264158249, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4842 + }, + { + "epoch": 3.8682108626198084, + "grad_norm": 0.0789603665471077, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4843 + }, + { + "epoch": 3.8690095846645365, + "grad_norm": 0.18372099101543427, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4844 + }, + { + "epoch": 3.869808306709265, + "grad_norm": 0.24845194816589355, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4845 + }, + { + "epoch": 3.8706070287539935, + "grad_norm": 0.22064632177352905, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4846 + }, + { + "epoch": 3.871405750798722, + "grad_norm": 0.0718264952301979, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4847 + }, + { + "epoch": 3.8722044728434506, + "grad_norm": 0.2048031985759735, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4848 + }, + { + "epoch": 3.873003194888179, + "grad_norm": 0.23190200328826904, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4849 + }, + { + "epoch": 3.873801916932907, + "grad_norm": 0.06851150840520859, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4850 + }, + { + "epoch": 3.8746006389776357, + "grad_norm": 0.2371164858341217, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4851 + }, + { + "epoch": 3.8753993610223643, + "grad_norm": 0.23518243432044983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4852 + }, + { + "epoch": 3.876198083067093, + "grad_norm": 0.08026961237192154, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4853 + }, + { + "epoch": 3.876996805111821, + "grad_norm": 0.1623634397983551, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4854 + }, + { + "epoch": 3.8777955271565494, + "grad_norm": 0.21676453948020935, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4855 + }, + { + "epoch": 3.878594249201278, + "grad_norm": 0.07868681848049164, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4856 + }, + { + "epoch": 3.8793929712460065, + "grad_norm": 0.18302997946739197, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4857 + }, + { + "epoch": 3.880191693290735, + "grad_norm": 0.2338407188653946, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4858 + }, + { + "epoch": 3.8809904153354635, + "grad_norm": 0.2534898817539215, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4859 + }, + { + "epoch": 3.8817891373801916, + "grad_norm": 0.19988521933555603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4860 + }, + { + "epoch": 3.88258785942492, + "grad_norm": 0.2896076440811157, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4861 + }, + { + "epoch": 3.8833865814696487, + "grad_norm": 0.1088651567697525, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4862 + }, + { + "epoch": 3.8841853035143767, + "grad_norm": 0.18549342453479767, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4863 + }, + { + "epoch": 3.8849840255591053, + "grad_norm": 0.24760019779205322, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4864 + }, + { + "epoch": 3.885782747603834, + "grad_norm": 0.1323750913143158, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4865 + }, + { + "epoch": 3.8865814696485623, + "grad_norm": 0.14235283434391022, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4866 + }, + { + "epoch": 3.887380191693291, + "grad_norm": 0.20409083366394043, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4867 + }, + { + "epoch": 3.8881789137380194, + "grad_norm": 0.1743297129869461, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4868 + }, + { + "epoch": 3.8889776357827475, + "grad_norm": 0.09692966938018799, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4869 + }, + { + "epoch": 3.889776357827476, + "grad_norm": 0.09934467077255249, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4870 + }, + { + "epoch": 3.8905750798722045, + "grad_norm": 0.2410827875137329, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4871 + }, + { + "epoch": 3.891373801916933, + "grad_norm": 0.27096229791641235, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4872 + }, + { + "epoch": 3.892172523961661, + "grad_norm": 0.09133906662464142, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4873 + }, + { + "epoch": 3.8929712460063897, + "grad_norm": 0.20275604724884033, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4874 + }, + { + "epoch": 3.893769968051118, + "grad_norm": 0.19578030705451965, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4875 + }, + { + "epoch": 3.8945686900958467, + "grad_norm": 0.12888970971107483, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4876 + }, + { + "epoch": 3.8953674121405752, + "grad_norm": 0.10301528871059418, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4877 + }, + { + "epoch": 3.8961661341853038, + "grad_norm": 0.1635914444923401, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4878 + }, + { + "epoch": 3.896964856230032, + "grad_norm": 0.1971803456544876, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4879 + }, + { + "epoch": 3.8977635782747604, + "grad_norm": 0.1085273027420044, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4880 + }, + { + "epoch": 3.898562300319489, + "grad_norm": 0.07375707477331161, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4881 + }, + { + "epoch": 3.899361022364217, + "grad_norm": 0.5828747153282166, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4882 + }, + { + "epoch": 3.9001597444089455, + "grad_norm": 0.10320120304822922, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4883 + }, + { + "epoch": 3.900958466453674, + "grad_norm": 0.10118676722049713, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4884 + }, + { + "epoch": 3.9017571884984026, + "grad_norm": 0.22034543752670288, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4885 + }, + { + "epoch": 3.902555910543131, + "grad_norm": 0.21823646128177643, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4886 + }, + { + "epoch": 3.9033546325878596, + "grad_norm": 0.14776065945625305, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4887 + }, + { + "epoch": 3.9041533546325877, + "grad_norm": 0.13297663629055023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4888 + }, + { + "epoch": 3.9049520766773163, + "grad_norm": 0.4447253942489624, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4889 + }, + { + "epoch": 3.905750798722045, + "grad_norm": 0.171112522482872, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4890 + }, + { + "epoch": 3.9065495207667733, + "grad_norm": 0.1581616848707199, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4891 + }, + { + "epoch": 3.9073482428115014, + "grad_norm": 0.18396562337875366, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4892 + }, + { + "epoch": 3.90814696485623, + "grad_norm": 0.15952393412590027, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4893 + }, + { + "epoch": 3.9089456869009584, + "grad_norm": 0.12889564037322998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4894 + }, + { + "epoch": 3.909744408945687, + "grad_norm": 0.130104660987854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4895 + }, + { + "epoch": 3.9105431309904155, + "grad_norm": 0.13011464476585388, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4896 + }, + { + "epoch": 3.911341853035144, + "grad_norm": 0.06485363095998764, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4897 + }, + { + "epoch": 3.912140575079872, + "grad_norm": 0.11353932321071625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4898 + }, + { + "epoch": 3.9129392971246006, + "grad_norm": 0.13279879093170166, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4899 + }, + { + "epoch": 3.913738019169329, + "grad_norm": 0.19181469082832336, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 4900 + }, + { + "epoch": 3.9145367412140573, + "grad_norm": 0.06930892914533615, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4901 + }, + { + "epoch": 3.915335463258786, + "grad_norm": 0.10591714829206467, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4902 + }, + { + "epoch": 3.9161341853035143, + "grad_norm": 0.09693296998739243, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4903 + }, + { + "epoch": 3.916932907348243, + "grad_norm": 0.1604270488023758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4904 + }, + { + "epoch": 3.9177316293929714, + "grad_norm": 0.19874586164951324, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4905 + }, + { + "epoch": 3.9185303514377, + "grad_norm": 0.09015987068414688, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4906 + }, + { + "epoch": 3.919329073482428, + "grad_norm": 0.09864864498376846, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4907 + }, + { + "epoch": 3.9201277955271565, + "grad_norm": 0.12509673833847046, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4908 + }, + { + "epoch": 3.920926517571885, + "grad_norm": 0.10216362774372101, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4909 + }, + { + "epoch": 3.9217252396166136, + "grad_norm": 0.11854741722345352, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4910 + }, + { + "epoch": 3.9225239616613417, + "grad_norm": 0.08570919930934906, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4911 + }, + { + "epoch": 3.92332268370607, + "grad_norm": 0.095781609416008, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4912 + }, + { + "epoch": 3.9241214057507987, + "grad_norm": 0.05698491260409355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4913 + }, + { + "epoch": 3.9249201277955272, + "grad_norm": 0.09786297380924225, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4914 + }, + { + "epoch": 3.9257188498402558, + "grad_norm": 0.1206512302160263, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4915 + }, + { + "epoch": 3.9265175718849843, + "grad_norm": 0.07593982666730881, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4916 + }, + { + "epoch": 3.9273162939297124, + "grad_norm": 0.06973730027675629, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4917 + }, + { + "epoch": 3.928115015974441, + "grad_norm": 0.07377546280622482, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4918 + }, + { + "epoch": 3.9289137380191694, + "grad_norm": 0.06871537119150162, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4919 + }, + { + "epoch": 3.9297124600638975, + "grad_norm": 0.09697525203227997, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4920 + }, + { + "epoch": 3.930511182108626, + "grad_norm": 0.07418478280305862, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4921 + }, + { + "epoch": 3.9313099041533546, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4922 + }, + { + "epoch": 3.932108626198083, + "grad_norm": 0.08099815994501114, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4923 + }, + { + "epoch": 3.9329073482428116, + "grad_norm": 0.08033913373947144, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4924 + }, + { + "epoch": 3.93370607028754, + "grad_norm": 0.1089775413274765, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4925 + }, + { + "epoch": 3.9345047923322682, + "grad_norm": 0.06866748631000519, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4926 + }, + { + "epoch": 3.9353035143769968, + "grad_norm": 0.12346489727497101, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4927 + }, + { + "epoch": 3.9361022364217253, + "grad_norm": 0.1388891190290451, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4928 + }, + { + "epoch": 3.936900958466454, + "grad_norm": 0.12678411602973938, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4929 + }, + { + "epoch": 3.937699680511182, + "grad_norm": 0.08638305962085724, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4930 + }, + { + "epoch": 3.9384984025559104, + "grad_norm": 0.667020320892334, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 4931 + }, + { + "epoch": 3.939297124600639, + "grad_norm": 0.0867542177438736, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4932 + }, + { + "epoch": 3.9400958466453675, + "grad_norm": 0.1075657457113266, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4933 + }, + { + "epoch": 3.940894568690096, + "grad_norm": 0.10359356552362442, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4934 + }, + { + "epoch": 3.9416932907348246, + "grad_norm": 0.04861772805452347, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4935 + }, + { + "epoch": 3.9424920127795526, + "grad_norm": 0.08871651440858841, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4936 + }, + { + "epoch": 3.943290734824281, + "grad_norm": 0.05268944799900055, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4937 + }, + { + "epoch": 3.9440894568690097, + "grad_norm": 0.11428069323301315, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4938 + }, + { + "epoch": 3.9448881789137378, + "grad_norm": 0.1302616149187088, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4939 + }, + { + "epoch": 3.9456869009584663, + "grad_norm": 0.09091098606586456, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4940 + }, + { + "epoch": 3.946485623003195, + "grad_norm": 0.23224923014640808, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4941 + }, + { + "epoch": 3.9472843450479234, + "grad_norm": 0.13427230715751648, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4942 + }, + { + "epoch": 3.948083067092652, + "grad_norm": 0.24157744646072388, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4943 + }, + { + "epoch": 3.9488817891373804, + "grad_norm": 0.15497569739818573, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4944 + }, + { + "epoch": 3.9496805111821085, + "grad_norm": 0.15587151050567627, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4945 + }, + { + "epoch": 3.950479233226837, + "grad_norm": 0.0827038437128067, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4946 + }, + { + "epoch": 3.9512779552715656, + "grad_norm": 0.17405007779598236, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4947 + }, + { + "epoch": 3.952076677316294, + "grad_norm": 0.1612532138824463, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4948 + }, + { + "epoch": 3.952875399361022, + "grad_norm": 0.07505665719509125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4949 + }, + { + "epoch": 3.9536741214057507, + "grad_norm": 0.07138567417860031, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4950 + }, + { + "epoch": 3.9544728434504792, + "grad_norm": 0.09206511080265045, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4951 + }, + { + "epoch": 3.9552715654952078, + "grad_norm": 0.09190725535154343, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4952 + }, + { + "epoch": 3.9560702875399363, + "grad_norm": 0.13024544715881348, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4953 + }, + { + "epoch": 3.956869009584665, + "grad_norm": 0.08161026239395142, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4954 + }, + { + "epoch": 3.957667731629393, + "grad_norm": 0.17207187414169312, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4955 + }, + { + "epoch": 3.9584664536741214, + "grad_norm": 0.096051886677742, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4956 + }, + { + "epoch": 3.95926517571885, + "grad_norm": 0.11038299649953842, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4957 + }, + { + "epoch": 3.960063897763578, + "grad_norm": 0.09957583248615265, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4958 + }, + { + "epoch": 3.9608626198083066, + "grad_norm": 0.06923667341470718, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4959 + }, + { + "epoch": 3.961661341853035, + "grad_norm": 0.07572069019079208, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4960 + }, + { + "epoch": 3.9624600638977636, + "grad_norm": 0.16801652312278748, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4961 + }, + { + "epoch": 3.963258785942492, + "grad_norm": 0.062117498368024826, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4962 + }, + { + "epoch": 3.9640575079872207, + "grad_norm": 0.08293396979570389, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4963 + }, + { + "epoch": 3.9648562300319488, + "grad_norm": 0.2021675407886505, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4964 + }, + { + "epoch": 3.9656549520766773, + "grad_norm": 0.10666973143815994, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4965 + }, + { + "epoch": 3.966453674121406, + "grad_norm": 0.09226572513580322, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4966 + }, + { + "epoch": 3.9672523961661343, + "grad_norm": 0.10113741457462311, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4967 + }, + { + "epoch": 3.9680511182108624, + "grad_norm": 0.10156626254320145, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4968 + }, + { + "epoch": 3.968849840255591, + "grad_norm": 0.08531442284584045, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4969 + }, + { + "epoch": 3.9696485623003195, + "grad_norm": 0.08894761651754379, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4970 + }, + { + "epoch": 3.970447284345048, + "grad_norm": 0.07934322953224182, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4971 + }, + { + "epoch": 3.9712460063897765, + "grad_norm": 0.07121701538562775, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4972 + }, + { + "epoch": 3.972044728434505, + "grad_norm": 0.09110251814126968, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4973 + }, + { + "epoch": 3.972843450479233, + "grad_norm": 0.09724952280521393, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4974 + }, + { + "epoch": 3.9736421725239617, + "grad_norm": 0.08619683235883713, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4975 + }, + { + "epoch": 3.97444089456869, + "grad_norm": 0.14789989590644836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4976 + }, + { + "epoch": 3.9752396166134183, + "grad_norm": 0.08736634254455566, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4977 + }, + { + "epoch": 3.976038338658147, + "grad_norm": 0.2260635793209076, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4978 + }, + { + "epoch": 3.9768370607028753, + "grad_norm": 0.2150910496711731, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4979 + }, + { + "epoch": 3.977635782747604, + "grad_norm": 0.12071242183446884, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4980 + }, + { + "epoch": 3.9784345047923324, + "grad_norm": 0.11614276468753815, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4981 + }, + { + "epoch": 3.979233226837061, + "grad_norm": 0.0954839214682579, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4982 + }, + { + "epoch": 3.980031948881789, + "grad_norm": 0.09801400452852249, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4983 + }, + { + "epoch": 3.9808306709265175, + "grad_norm": 0.07435343414545059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4984 + }, + { + "epoch": 3.981629392971246, + "grad_norm": 0.09401766955852509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4985 + }, + { + "epoch": 3.9824281150159746, + "grad_norm": 0.09850753843784332, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4986 + }, + { + "epoch": 3.9832268370607027, + "grad_norm": 0.07880235463380814, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4987 + }, + { + "epoch": 3.984025559105431, + "grad_norm": 0.08208848536014557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4988 + }, + { + "epoch": 3.9848242811501597, + "grad_norm": 0.10432668030261993, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4989 + }, + { + "epoch": 3.9856230031948883, + "grad_norm": 0.05202944204211235, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4990 + }, + { + "epoch": 3.986421725239617, + "grad_norm": 0.0831860601902008, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4991 + }, + { + "epoch": 3.987220447284345, + "grad_norm": 0.1084689050912857, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4992 + }, + { + "epoch": 3.9880191693290734, + "grad_norm": 0.1095893383026123, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4993 + }, + { + "epoch": 3.988817891373802, + "grad_norm": 0.24480414390563965, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4994 + }, + { + "epoch": 3.9896166134185305, + "grad_norm": 0.11939835548400879, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4995 + }, + { + "epoch": 3.9904153354632586, + "grad_norm": 0.0829034298658371, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4996 + }, + { + "epoch": 3.991214057507987, + "grad_norm": 0.1649356484413147, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4997 + }, + { + "epoch": 3.9920127795527156, + "grad_norm": 0.18428824841976166, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4998 + }, + { + "epoch": 3.992811501597444, + "grad_norm": 0.14441022276878357, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4999 + }, + { + "epoch": 3.9936102236421727, + "grad_norm": 0.1025838553905487, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5000 + }, + { + "epoch": 3.994408945686901, + "grad_norm": 0.18659353256225586, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5001 + }, + { + "epoch": 3.9952076677316293, + "grad_norm": 0.18462489545345306, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5002 + }, + { + "epoch": 3.996006389776358, + "grad_norm": 0.11221570521593094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5003 + }, + { + "epoch": 3.9968051118210863, + "grad_norm": 0.1611207127571106, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5004 + }, + { + "epoch": 3.997603833865815, + "grad_norm": 0.10003258287906647, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5005 + }, + { + "epoch": 3.998402555910543, + "grad_norm": 0.06686410307884216, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5006 + }, + { + "epoch": 3.9992012779552715, + "grad_norm": 0.07527180016040802, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5007 + }, + { + "epoch": 4.0, + "grad_norm": 0.11602520197629929, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5008 + }, + { + "epoch": 4.0007987220447285, + "grad_norm": 0.04460546746850014, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5009 + }, + { + "epoch": 4.001597444089457, + "grad_norm": 1.1286108493804932, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5010 + }, + { + "epoch": 4.002396166134186, + "grad_norm": 0.12730571627616882, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5011 + }, + { + "epoch": 4.003194888178914, + "grad_norm": 0.060798924416303635, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5012 + }, + { + "epoch": 4.003993610223642, + "grad_norm": 0.11491188406944275, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5013 + }, + { + "epoch": 4.00479233226837, + "grad_norm": 0.09877663850784302, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5014 + }, + { + "epoch": 4.005591054313099, + "grad_norm": 0.06991511583328247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5015 + }, + { + "epoch": 4.006389776357827, + "grad_norm": 0.05524459481239319, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5016 + }, + { + "epoch": 4.007188498402556, + "grad_norm": 0.07421471178531647, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5017 + }, + { + "epoch": 4.007987220447284, + "grad_norm": 0.10918284207582474, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5018 + }, + { + "epoch": 4.008785942492013, + "grad_norm": 0.42926761507987976, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5019 + }, + { + "epoch": 4.0095846645367414, + "grad_norm": 0.12511351704597473, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5020 + }, + { + "epoch": 4.01038338658147, + "grad_norm": 0.0985826924443245, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5021 + }, + { + "epoch": 4.0111821086261985, + "grad_norm": 0.10876046866178513, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5022 + }, + { + "epoch": 4.011980830670926, + "grad_norm": 0.0973401740193367, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5023 + }, + { + "epoch": 4.012779552715655, + "grad_norm": 0.10867046564817429, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5024 + }, + { + "epoch": 4.013578274760383, + "grad_norm": 0.16030259430408478, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5025 + }, + { + "epoch": 4.014376996805112, + "grad_norm": 0.09972470998764038, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5026 + }, + { + "epoch": 4.01517571884984, + "grad_norm": 0.06945701688528061, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5027 + }, + { + "epoch": 4.015974440894569, + "grad_norm": 0.12256570160388947, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5028 + }, + { + "epoch": 4.016773162939297, + "grad_norm": 0.1318589597940445, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5029 + }, + { + "epoch": 4.017571884984026, + "grad_norm": 0.14831772446632385, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5030 + }, + { + "epoch": 4.018370607028754, + "grad_norm": 0.12650129199028015, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5031 + }, + { + "epoch": 4.019169329073482, + "grad_norm": 0.25457820296287537, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5032 + }, + { + "epoch": 4.0199680511182105, + "grad_norm": 0.10183271020650864, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5033 + }, + { + "epoch": 4.020766773162939, + "grad_norm": 0.14198726415634155, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5034 + }, + { + "epoch": 4.021565495207668, + "grad_norm": 0.1551627218723297, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5035 + }, + { + "epoch": 4.022364217252396, + "grad_norm": 0.29212328791618347, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5036 + }, + { + "epoch": 4.023162939297125, + "grad_norm": 0.25203290581703186, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5037 + }, + { + "epoch": 4.023961661341853, + "grad_norm": 0.12793950736522675, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5038 + }, + { + "epoch": 4.024760383386582, + "grad_norm": 0.10916420817375183, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5039 + }, + { + "epoch": 4.02555910543131, + "grad_norm": 0.09980735182762146, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5040 + }, + { + "epoch": 4.026357827476039, + "grad_norm": 0.1633901745080948, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5041 + }, + { + "epoch": 4.027156549520766, + "grad_norm": 0.10058299452066422, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5042 + }, + { + "epoch": 4.027955271565495, + "grad_norm": 0.08121561259031296, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5043 + }, + { + "epoch": 4.0287539936102235, + "grad_norm": 0.19947005808353424, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5044 + }, + { + "epoch": 4.029552715654952, + "grad_norm": 0.24219068884849548, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5045 + }, + { + "epoch": 4.0303514376996805, + "grad_norm": 0.28928735852241516, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5046 + }, + { + "epoch": 4.031150159744409, + "grad_norm": 0.062404267489910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5047 + }, + { + "epoch": 4.031948881789138, + "grad_norm": 0.1607569456100464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5048 + }, + { + "epoch": 4.032747603833866, + "grad_norm": 0.14420244097709656, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5049 + }, + { + "epoch": 4.033546325878595, + "grad_norm": 0.838013768196106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5050 + }, + { + "epoch": 4.034345047923322, + "grad_norm": 0.15198078751564026, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5051 + }, + { + "epoch": 4.035143769968051, + "grad_norm": 0.18439999222755432, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5052 + }, + { + "epoch": 4.035942492012779, + "grad_norm": 0.1283460259437561, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5053 + }, + { + "epoch": 4.036741214057508, + "grad_norm": 0.07285412400960922, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5054 + }, + { + "epoch": 4.037539936102236, + "grad_norm": 0.21856451034545898, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5055 + }, + { + "epoch": 4.038338658146965, + "grad_norm": 0.1934041529893875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5056 + }, + { + "epoch": 4.039137380191693, + "grad_norm": 0.07998216152191162, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5057 + }, + { + "epoch": 4.039936102236422, + "grad_norm": 0.2202988713979721, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5058 + }, + { + "epoch": 4.0407348242811505, + "grad_norm": 0.22000271081924438, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5059 + }, + { + "epoch": 4.041533546325879, + "grad_norm": 0.06229308247566223, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5060 + }, + { + "epoch": 4.042332268370607, + "grad_norm": 0.19611188769340515, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5061 + }, + { + "epoch": 4.043130990415335, + "grad_norm": 0.2385999858379364, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5062 + }, + { + "epoch": 4.043929712460064, + "grad_norm": 0.06504995375871658, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5063 + }, + { + "epoch": 4.044728434504792, + "grad_norm": 0.17860567569732666, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5064 + }, + { + "epoch": 4.045527156549521, + "grad_norm": 0.17580853402614594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5065 + }, + { + "epoch": 4.046325878594249, + "grad_norm": 0.06523217260837555, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5066 + }, + { + "epoch": 4.047124600638978, + "grad_norm": 0.2795565128326416, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5067 + }, + { + "epoch": 4.047923322683706, + "grad_norm": 0.289105623960495, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5068 + }, + { + "epoch": 4.048722044728435, + "grad_norm": 0.07829197496175766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5069 + }, + { + "epoch": 4.0495207667731625, + "grad_norm": 0.24165435135364532, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5070 + }, + { + "epoch": 4.050319488817891, + "grad_norm": 0.2785094976425171, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5071 + }, + { + "epoch": 4.05111821086262, + "grad_norm": 0.08929550647735596, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5072 + }, + { + "epoch": 4.051916932907348, + "grad_norm": 0.24677781760692596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5073 + }, + { + "epoch": 4.052715654952077, + "grad_norm": 0.25207674503326416, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5074 + }, + { + "epoch": 4.053514376996805, + "grad_norm": 0.06409729272127151, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5075 + }, + { + "epoch": 4.054313099041534, + "grad_norm": 0.2670205235481262, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5076 + }, + { + "epoch": 4.055111821086262, + "grad_norm": 0.1854943484067917, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5077 + }, + { + "epoch": 4.055910543130991, + "grad_norm": 0.1409354954957962, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5078 + }, + { + "epoch": 4.056709265175719, + "grad_norm": 0.24084609746932983, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5079 + }, + { + "epoch": 4.057507987220447, + "grad_norm": 0.16520382463932037, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5080 + }, + { + "epoch": 4.0583067092651754, + "grad_norm": 0.11086967587471008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5081 + }, + { + "epoch": 4.059105431309904, + "grad_norm": 0.15748612582683563, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5082 + }, + { + "epoch": 4.0599041533546325, + "grad_norm": 0.1196034848690033, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5083 + }, + { + "epoch": 4.060702875399361, + "grad_norm": 0.06799823045730591, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5084 + }, + { + "epoch": 4.06150159744409, + "grad_norm": 0.1223025768995285, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5085 + }, + { + "epoch": 4.062300319488818, + "grad_norm": 0.04760991781949997, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5086 + }, + { + "epoch": 4.063099041533547, + "grad_norm": 0.11782078444957733, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5087 + }, + { + "epoch": 4.063897763578275, + "grad_norm": 0.13057227432727814, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5088 + }, + { + "epoch": 4.064696485623003, + "grad_norm": 0.0719611644744873, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5089 + }, + { + "epoch": 4.065495207667731, + "grad_norm": 0.13513247668743134, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5090 + }, + { + "epoch": 4.06629392971246, + "grad_norm": 0.14960692822933197, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5091 + }, + { + "epoch": 4.067092651757188, + "grad_norm": 0.06219497323036194, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5092 + }, + { + "epoch": 4.067891373801917, + "grad_norm": 0.06755383312702179, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5093 + }, + { + "epoch": 4.068690095846645, + "grad_norm": 0.08237830549478531, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5094 + }, + { + "epoch": 4.069488817891374, + "grad_norm": 0.0915946289896965, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5095 + }, + { + "epoch": 4.0702875399361025, + "grad_norm": 0.06893479824066162, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5096 + }, + { + "epoch": 4.071086261980831, + "grad_norm": 0.04133071005344391, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5097 + }, + { + "epoch": 4.0718849840255595, + "grad_norm": 0.062333185225725174, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5098 + }, + { + "epoch": 4.072683706070287, + "grad_norm": 0.05741016939282417, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5099 + }, + { + "epoch": 4.073482428115016, + "grad_norm": 0.04988866671919823, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5100 + }, + { + "epoch": 4.074281150159744, + "grad_norm": 0.050187818706035614, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5101 + }, + { + "epoch": 4.075079872204473, + "grad_norm": 0.08479643613100052, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5102 + }, + { + "epoch": 4.075878594249201, + "grad_norm": 0.13840351998806, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5103 + }, + { + "epoch": 4.07667731629393, + "grad_norm": 0.11400903016328812, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5104 + }, + { + "epoch": 4.077476038338658, + "grad_norm": 0.06956811994314194, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5105 + }, + { + "epoch": 4.078274760383387, + "grad_norm": 0.09173833578824997, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5106 + }, + { + "epoch": 4.079073482428115, + "grad_norm": 0.09024006128311157, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5107 + }, + { + "epoch": 4.079872204472843, + "grad_norm": 0.04257406294345856, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5108 + }, + { + "epoch": 4.080670926517572, + "grad_norm": 0.04252707585692406, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5109 + }, + { + "epoch": 4.0814696485623, + "grad_norm": 0.052367035299539566, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5110 + }, + { + "epoch": 4.082268370607029, + "grad_norm": 0.06344939023256302, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5111 + }, + { + "epoch": 4.083067092651757, + "grad_norm": 0.04674215242266655, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5112 + }, + { + "epoch": 4.083865814696486, + "grad_norm": 0.03664534166455269, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5113 + }, + { + "epoch": 4.084664536741214, + "grad_norm": 0.07198764383792877, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5114 + }, + { + "epoch": 4.085463258785943, + "grad_norm": 0.06294529885053635, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5115 + }, + { + "epoch": 4.086261980830671, + "grad_norm": 0.09595668315887451, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5116 + }, + { + "epoch": 4.0870607028754, + "grad_norm": 0.09830893576145172, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5117 + }, + { + "epoch": 4.087859424920127, + "grad_norm": 0.09647611528635025, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5118 + }, + { + "epoch": 4.088658146964856, + "grad_norm": 0.04558149725198746, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5119 + }, + { + "epoch": 4.0894568690095845, + "grad_norm": 0.11090628057718277, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5120 + }, + { + "epoch": 4.090255591054313, + "grad_norm": 0.1119648665189743, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5121 + }, + { + "epoch": 4.0910543130990416, + "grad_norm": 0.0372939296066761, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5122 + }, + { + "epoch": 4.09185303514377, + "grad_norm": 0.10749047994613647, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5123 + }, + { + "epoch": 4.092651757188499, + "grad_norm": 0.08718341588973999, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5124 + }, + { + "epoch": 4.093450479233227, + "grad_norm": 0.04954478517174721, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5125 + }, + { + "epoch": 4.094249201277956, + "grad_norm": 0.0599503293633461, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5126 + }, + { + "epoch": 4.095047923322683, + "grad_norm": 0.04633599892258644, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5127 + }, + { + "epoch": 4.095846645367412, + "grad_norm": 0.0502074733376503, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5128 + }, + { + "epoch": 4.09664536741214, + "grad_norm": 0.1348472684621811, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5129 + }, + { + "epoch": 4.097444089456869, + "grad_norm": 0.07534858584403992, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5130 + }, + { + "epoch": 4.098242811501597, + "grad_norm": 0.04207107052206993, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5131 + }, + { + "epoch": 4.099041533546326, + "grad_norm": 0.062090687453746796, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5132 + }, + { + "epoch": 4.0998402555910545, + "grad_norm": 0.08783479779958725, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5133 + }, + { + "epoch": 4.100638977635783, + "grad_norm": 0.04489055275917053, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5134 + }, + { + "epoch": 4.1014376996805115, + "grad_norm": 0.07360105961561203, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5135 + }, + { + "epoch": 4.102236421725239, + "grad_norm": 0.10253020375967026, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5136 + }, + { + "epoch": 4.103035143769968, + "grad_norm": 0.12787389755249023, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5137 + }, + { + "epoch": 4.103833865814696, + "grad_norm": 0.43946513533592224, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5138 + }, + { + "epoch": 4.104632587859425, + "grad_norm": 0.7717093825340271, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5139 + }, + { + "epoch": 4.105431309904153, + "grad_norm": 0.1433849334716797, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5140 + }, + { + "epoch": 4.106230031948882, + "grad_norm": 0.09110052138566971, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5141 + }, + { + "epoch": 4.10702875399361, + "grad_norm": 0.13785111904144287, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5142 + }, + { + "epoch": 4.107827476038339, + "grad_norm": 0.0910695344209671, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5143 + }, + { + "epoch": 4.108626198083067, + "grad_norm": 0.10390721261501312, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5144 + }, + { + "epoch": 4.109424920127796, + "grad_norm": 0.07039178162813187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5145 + }, + { + "epoch": 4.110223642172524, + "grad_norm": 0.08536665886640549, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5146 + }, + { + "epoch": 4.111022364217252, + "grad_norm": 0.1355360597372055, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5147 + }, + { + "epoch": 4.111821086261981, + "grad_norm": 0.13981834053993225, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5148 + }, + { + "epoch": 4.112619808306709, + "grad_norm": 0.12653453648090363, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5149 + }, + { + "epoch": 4.113418530351438, + "grad_norm": 0.06805716454982758, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5150 + }, + { + "epoch": 4.114217252396166, + "grad_norm": 0.14361023902893066, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5151 + }, + { + "epoch": 4.115015974440895, + "grad_norm": 0.15223950147628784, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5152 + }, + { + "epoch": 4.115814696485623, + "grad_norm": 0.10013193637132645, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5153 + }, + { + "epoch": 4.116613418530352, + "grad_norm": 0.21049730479717255, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5154 + }, + { + "epoch": 4.11741214057508, + "grad_norm": 0.1393776834011078, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5155 + }, + { + "epoch": 4.118210862619808, + "grad_norm": 0.08584857732057571, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5156 + }, + { + "epoch": 4.1190095846645365, + "grad_norm": 0.06729432195425034, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5157 + }, + { + "epoch": 4.119808306709265, + "grad_norm": 0.08861853927373886, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5158 + }, + { + "epoch": 4.1206070287539935, + "grad_norm": 0.07037574052810669, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5159 + }, + { + "epoch": 4.121405750798722, + "grad_norm": 0.08049193024635315, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5160 + }, + { + "epoch": 4.122204472843451, + "grad_norm": 0.09040962159633636, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5161 + }, + { + "epoch": 4.123003194888179, + "grad_norm": 0.06531825661659241, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5162 + }, + { + "epoch": 4.123801916932908, + "grad_norm": 0.09423618763685226, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5163 + }, + { + "epoch": 4.124600638977636, + "grad_norm": 0.09436366707086563, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5164 + }, + { + "epoch": 4.125399361022364, + "grad_norm": 0.07543698698282242, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5165 + }, + { + "epoch": 4.126198083067092, + "grad_norm": 0.07491134852170944, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 5166 + }, + { + "epoch": 4.126996805111821, + "grad_norm": 0.09040437638759613, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5167 + }, + { + "epoch": 4.127795527156549, + "grad_norm": 0.11145798116922379, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5168 + }, + { + "epoch": 4.128594249201278, + "grad_norm": 0.35186707973480225, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5169 + }, + { + "epoch": 4.1293929712460065, + "grad_norm": 0.08744635432958603, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5170 + }, + { + "epoch": 4.130191693290735, + "grad_norm": 0.1078719049692154, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5171 + }, + { + "epoch": 4.1309904153354635, + "grad_norm": 0.13568760454654694, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5172 + }, + { + "epoch": 4.131789137380192, + "grad_norm": 0.10629335045814514, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5173 + }, + { + "epoch": 4.13258785942492, + "grad_norm": 0.3467697203159332, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5174 + }, + { + "epoch": 4.133386581469648, + "grad_norm": 0.5514479875564575, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5175 + }, + { + "epoch": 4.134185303514377, + "grad_norm": 0.2762874960899353, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5176 + }, + { + "epoch": 4.134984025559105, + "grad_norm": 0.25959524512290955, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5177 + }, + { + "epoch": 4.135782747603834, + "grad_norm": 0.26429036259651184, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5178 + }, + { + "epoch": 4.136581469648562, + "grad_norm": 0.4492235779762268, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5179 + }, + { + "epoch": 4.137380191693291, + "grad_norm": 0.3261977732181549, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 5180 + }, + { + "epoch": 4.138178913738019, + "grad_norm": 0.15618108212947845, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5181 + }, + { + "epoch": 4.138977635782748, + "grad_norm": 0.2897289991378784, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5182 + }, + { + "epoch": 4.139776357827476, + "grad_norm": 0.2599884271621704, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5183 + }, + { + "epoch": 4.140575079872204, + "grad_norm": 0.3158198893070221, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5184 + }, + { + "epoch": 4.141373801916933, + "grad_norm": 0.2701073884963989, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5185 + }, + { + "epoch": 4.142172523961661, + "grad_norm": 0.14668017625808716, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5186 + }, + { + "epoch": 4.14297124600639, + "grad_norm": 0.14284202456474304, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5187 + }, + { + "epoch": 4.143769968051118, + "grad_norm": 0.1901128888130188, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5188 + }, + { + "epoch": 4.144568690095847, + "grad_norm": 0.17808575928211212, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5189 + }, + { + "epoch": 4.145367412140575, + "grad_norm": 0.11329478025436401, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5190 + }, + { + "epoch": 4.146166134185304, + "grad_norm": 0.10816467553377151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5191 + }, + { + "epoch": 4.146964856230032, + "grad_norm": 0.11593834310770035, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5192 + }, + { + "epoch": 4.147763578274761, + "grad_norm": 0.17315705120563507, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5193 + }, + { + "epoch": 4.1485623003194885, + "grad_norm": 0.10884186625480652, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5194 + }, + { + "epoch": 4.149361022364217, + "grad_norm": 0.17528203129768372, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5195 + }, + { + "epoch": 4.1501597444089455, + "grad_norm": 0.3249641954898834, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5196 + }, + { + "epoch": 4.150958466453674, + "grad_norm": 0.2920859456062317, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5197 + }, + { + "epoch": 4.151757188498403, + "grad_norm": 0.12487918138504028, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5198 + }, + { + "epoch": 4.152555910543131, + "grad_norm": 0.07744348049163818, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5199 + }, + { + "epoch": 4.15335463258786, + "grad_norm": 0.11721999943256378, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5200 + }, + { + "epoch": 4.154153354632588, + "grad_norm": 0.17566390335559845, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5201 + }, + { + "epoch": 4.154952076677317, + "grad_norm": 0.09762726724147797, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5202 + }, + { + "epoch": 4.155750798722044, + "grad_norm": 0.10769844055175781, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5203 + }, + { + "epoch": 4.156549520766773, + "grad_norm": 0.1608363389968872, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5204 + }, + { + "epoch": 4.157348242811501, + "grad_norm": 0.1575978696346283, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5205 + }, + { + "epoch": 4.15814696485623, + "grad_norm": 0.2035059779882431, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5206 + }, + { + "epoch": 4.1589456869009584, + "grad_norm": 0.1405210644006729, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5207 + }, + { + "epoch": 4.159744408945687, + "grad_norm": 0.18898408114910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5208 + }, + { + "epoch": 4.1605431309904155, + "grad_norm": 0.20012563467025757, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5209 + }, + { + "epoch": 4.161341853035144, + "grad_norm": 0.14585568010807037, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5210 + }, + { + "epoch": 4.162140575079873, + "grad_norm": 0.166448175907135, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5211 + }, + { + "epoch": 4.1629392971246, + "grad_norm": 0.08768735080957413, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5212 + }, + { + "epoch": 4.163738019169329, + "grad_norm": 0.12429258227348328, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5213 + }, + { + "epoch": 4.164536741214057, + "grad_norm": 0.06750953942537308, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5214 + }, + { + "epoch": 4.165335463258786, + "grad_norm": 0.10137717425823212, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5215 + }, + { + "epoch": 4.166134185303514, + "grad_norm": 0.1015368178486824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5216 + }, + { + "epoch": 4.166932907348243, + "grad_norm": 0.12396319955587387, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5217 + }, + { + "epoch": 4.167731629392971, + "grad_norm": 0.11295704543590546, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5218 + }, + { + "epoch": 4.1685303514377, + "grad_norm": 0.1415906846523285, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5219 + }, + { + "epoch": 4.169329073482428, + "grad_norm": 0.1300252079963684, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5220 + }, + { + "epoch": 4.170127795527157, + "grad_norm": 0.09486760199069977, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5221 + }, + { + "epoch": 4.170926517571885, + "grad_norm": 0.25776198506355286, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5222 + }, + { + "epoch": 4.171725239616613, + "grad_norm": 0.07684944570064545, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5223 + }, + { + "epoch": 4.172523961661342, + "grad_norm": 0.06909538060426712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5224 + }, + { + "epoch": 4.17332268370607, + "grad_norm": 0.09686419367790222, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5225 + }, + { + "epoch": 4.174121405750799, + "grad_norm": 0.10760180652141571, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5226 + }, + { + "epoch": 4.174920127795527, + "grad_norm": 0.0963902473449707, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5227 + }, + { + "epoch": 4.175718849840256, + "grad_norm": 0.12986192107200623, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5228 + }, + { + "epoch": 4.176517571884984, + "grad_norm": 0.12532354891300201, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5229 + }, + { + "epoch": 4.177316293929713, + "grad_norm": 0.158639058470726, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5230 + }, + { + "epoch": 4.178115015974441, + "grad_norm": 0.10025905817747116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5231 + }, + { + "epoch": 4.178913738019169, + "grad_norm": 0.19150952994823456, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5232 + }, + { + "epoch": 4.1797124600638975, + "grad_norm": 0.10650201886892319, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5233 + }, + { + "epoch": 4.180511182108626, + "grad_norm": 0.08948210626840591, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5234 + }, + { + "epoch": 4.181309904153355, + "grad_norm": 0.144260972738266, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5235 + }, + { + "epoch": 4.182108626198083, + "grad_norm": 0.10631201416254044, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5236 + }, + { + "epoch": 4.182907348242812, + "grad_norm": 0.17884188890457153, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5237 + }, + { + "epoch": 4.18370607028754, + "grad_norm": 0.12393054366111755, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5238 + }, + { + "epoch": 4.184504792332269, + "grad_norm": 0.10113117098808289, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5239 + }, + { + "epoch": 4.185303514376997, + "grad_norm": 0.08745535463094711, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5240 + }, + { + "epoch": 4.186102236421725, + "grad_norm": 0.12319829314947128, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5241 + }, + { + "epoch": 4.186900958466453, + "grad_norm": 0.10202868282794952, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5242 + }, + { + "epoch": 4.187699680511182, + "grad_norm": 0.12799306213855743, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5243 + }, + { + "epoch": 4.18849840255591, + "grad_norm": 0.10247227549552917, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5244 + }, + { + "epoch": 4.189297124600639, + "grad_norm": 0.0876200944185257, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5245 + }, + { + "epoch": 4.1900958466453675, + "grad_norm": 0.08829693496227264, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5246 + }, + { + "epoch": 4.190894568690096, + "grad_norm": 0.09005091339349747, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5247 + }, + { + "epoch": 4.1916932907348246, + "grad_norm": 0.06715424358844757, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5248 + }, + { + "epoch": 4.192492012779553, + "grad_norm": 0.11082255840301514, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5249 + }, + { + "epoch": 4.193290734824281, + "grad_norm": 0.08197743445634842, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5250 + }, + { + "epoch": 4.194089456869009, + "grad_norm": 0.08641887456178665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5251 + }, + { + "epoch": 4.194888178913738, + "grad_norm": 0.29264676570892334, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5252 + }, + { + "epoch": 4.195686900958466, + "grad_norm": 0.10122201591730118, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5253 + }, + { + "epoch": 4.196485623003195, + "grad_norm": 0.13220930099487305, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5254 + }, + { + "epoch": 4.197284345047923, + "grad_norm": 0.05919777229428291, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5255 + }, + { + "epoch": 4.198083067092652, + "grad_norm": 0.15947407484054565, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5256 + }, + { + "epoch": 4.19888178913738, + "grad_norm": 0.08046088367700577, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5257 + }, + { + "epoch": 4.199680511182109, + "grad_norm": 0.08504491299390793, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5258 + }, + { + "epoch": 4.2004792332268375, + "grad_norm": 0.2523876428604126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5259 + }, + { + "epoch": 4.201277955271565, + "grad_norm": 0.32436496019363403, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5260 + }, + { + "epoch": 4.202076677316294, + "grad_norm": 0.3832956552505493, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5261 + }, + { + "epoch": 4.202875399361022, + "grad_norm": 0.15481804311275482, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5262 + }, + { + "epoch": 4.203674121405751, + "grad_norm": 0.5061212182044983, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5263 + }, + { + "epoch": 4.204472843450479, + "grad_norm": 0.2778873145580292, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5264 + }, + { + "epoch": 4.205271565495208, + "grad_norm": 0.10782434046268463, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5265 + }, + { + "epoch": 4.206070287539936, + "grad_norm": 0.2730430066585541, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5266 + }, + { + "epoch": 4.206869009584665, + "grad_norm": 0.14902958273887634, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5267 + }, + { + "epoch": 4.207667731629393, + "grad_norm": 0.2455812245607376, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5268 + }, + { + "epoch": 4.208466453674121, + "grad_norm": 0.36285653710365295, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5269 + }, + { + "epoch": 4.2092651757188495, + "grad_norm": 0.16104358434677124, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5270 + }, + { + "epoch": 4.210063897763578, + "grad_norm": 0.10330995172262192, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5271 + }, + { + "epoch": 4.210862619808307, + "grad_norm": 0.14438849687576294, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5272 + }, + { + "epoch": 4.211661341853035, + "grad_norm": 0.11719724535942078, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5273 + }, + { + "epoch": 4.212460063897764, + "grad_norm": 0.13503463566303253, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5274 + }, + { + "epoch": 4.213258785942492, + "grad_norm": 0.12717710435390472, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5275 + }, + { + "epoch": 4.214057507987221, + "grad_norm": 0.12293769419193268, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5276 + }, + { + "epoch": 4.214856230031949, + "grad_norm": 0.11828786134719849, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5277 + }, + { + "epoch": 4.215654952076678, + "grad_norm": 0.11118468642234802, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5278 + }, + { + "epoch": 4.216453674121405, + "grad_norm": 0.15688025951385498, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5279 + }, + { + "epoch": 4.217252396166134, + "grad_norm": 0.10603991895914078, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5280 + }, + { + "epoch": 4.218051118210862, + "grad_norm": 0.14034971594810486, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5281 + }, + { + "epoch": 4.218849840255591, + "grad_norm": 0.21270571649074554, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5282 + }, + { + "epoch": 4.2196485623003195, + "grad_norm": 0.17699144780635834, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5283 + }, + { + "epoch": 4.220447284345048, + "grad_norm": 0.07665220648050308, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5284 + }, + { + "epoch": 4.2212460063897765, + "grad_norm": 0.13917282223701477, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5285 + }, + { + "epoch": 4.222044728434505, + "grad_norm": 0.1253320872783661, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5286 + }, + { + "epoch": 4.222843450479234, + "grad_norm": 0.07693646103143692, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5287 + }, + { + "epoch": 4.223642172523961, + "grad_norm": 0.11877891421318054, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5288 + }, + { + "epoch": 4.22444089456869, + "grad_norm": 0.08900399506092072, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5289 + }, + { + "epoch": 4.225239616613418, + "grad_norm": 0.08575741946697235, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5290 + }, + { + "epoch": 4.226038338658147, + "grad_norm": 0.11078973859548569, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5291 + }, + { + "epoch": 4.226837060702875, + "grad_norm": 0.12371394783258438, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5292 + }, + { + "epoch": 4.227635782747604, + "grad_norm": 0.11741651594638824, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5293 + }, + { + "epoch": 4.228434504792332, + "grad_norm": 0.1316244751214981, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5294 + }, + { + "epoch": 4.229233226837061, + "grad_norm": 0.07751733064651489, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5295 + }, + { + "epoch": 4.2300319488817895, + "grad_norm": 0.13512739539146423, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5296 + }, + { + "epoch": 4.230830670926518, + "grad_norm": 0.14408327639102936, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5297 + }, + { + "epoch": 4.231629392971246, + "grad_norm": 0.05596759170293808, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5298 + }, + { + "epoch": 4.232428115015974, + "grad_norm": 0.20518198609352112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5299 + }, + { + "epoch": 4.233226837060703, + "grad_norm": 0.17000356316566467, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5300 + }, + { + "epoch": 4.234025559105431, + "grad_norm": 0.10213350504636765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5301 + }, + { + "epoch": 4.23482428115016, + "grad_norm": 0.1633368879556656, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 4.235623003194888, + "grad_norm": 0.17330236732959747, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5303 + }, + { + "epoch": 4.236421725239617, + "grad_norm": 0.20028679072856903, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5304 + }, + { + "epoch": 4.237220447284345, + "grad_norm": 0.23386533558368683, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5305 + }, + { + "epoch": 4.238019169329074, + "grad_norm": 0.051739469170570374, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5306 + }, + { + "epoch": 4.2388178913738015, + "grad_norm": 0.19732257723808289, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5307 + }, + { + "epoch": 4.23961661341853, + "grad_norm": 0.1318890005350113, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5308 + }, + { + "epoch": 4.2404153354632586, + "grad_norm": 0.17188113927841187, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5309 + }, + { + "epoch": 4.241214057507987, + "grad_norm": 0.23981456458568573, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5310 + }, + { + "epoch": 4.242012779552716, + "grad_norm": 0.15658913552761078, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5311 + }, + { + "epoch": 4.242811501597444, + "grad_norm": 0.13481132686138153, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5312 + }, + { + "epoch": 4.243610223642173, + "grad_norm": 0.16327355802059174, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5313 + }, + { + "epoch": 4.244408945686901, + "grad_norm": 0.0873674675822258, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5314 + }, + { + "epoch": 4.24520766773163, + "grad_norm": 0.16612505912780762, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5315 + }, + { + "epoch": 4.246006389776358, + "grad_norm": 0.15376444160938263, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5316 + }, + { + "epoch": 4.246805111821086, + "grad_norm": 0.07853512465953827, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5317 + }, + { + "epoch": 4.247603833865814, + "grad_norm": 0.11799992620944977, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5318 + }, + { + "epoch": 4.248402555910543, + "grad_norm": 0.09121575206518173, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 5319 + }, + { + "epoch": 4.2492012779552715, + "grad_norm": 0.09780153632164001, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5320 + }, + { + "epoch": 4.25, + "grad_norm": 0.11387690156698227, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5321 + }, + { + "epoch": 4.2507987220447285, + "grad_norm": 0.08085697889328003, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5322 + }, + { + "epoch": 4.251597444089457, + "grad_norm": 0.09986089169979095, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5323 + }, + { + "epoch": 4.252396166134186, + "grad_norm": 0.07728606462478638, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5324 + }, + { + "epoch": 4.253194888178914, + "grad_norm": 0.07464555650949478, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5325 + }, + { + "epoch": 4.253993610223642, + "grad_norm": 0.05129759758710861, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5326 + }, + { + "epoch": 4.25479233226837, + "grad_norm": 0.060275599360466, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5327 + }, + { + "epoch": 4.255591054313099, + "grad_norm": 0.07773016393184662, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5328 + }, + { + "epoch": 4.256389776357827, + "grad_norm": 0.1046462282538414, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5329 + }, + { + "epoch": 4.257188498402556, + "grad_norm": 0.1184321865439415, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5330 + }, + { + "epoch": 4.257987220447284, + "grad_norm": 0.1419631987810135, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5331 + }, + { + "epoch": 4.258785942492013, + "grad_norm": 0.10022144019603729, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5332 + }, + { + "epoch": 4.2595846645367414, + "grad_norm": 0.075701504945755, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5333 + }, + { + "epoch": 4.26038338658147, + "grad_norm": 0.18145573139190674, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5334 + }, + { + "epoch": 4.261182108626198, + "grad_norm": 0.06092703342437744, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5335 + }, + { + "epoch": 4.261980830670926, + "grad_norm": 0.13196219503879547, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5336 + }, + { + "epoch": 4.262779552715655, + "grad_norm": 0.17139793932437897, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5337 + }, + { + "epoch": 4.263578274760383, + "grad_norm": 0.12072623521089554, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5338 + }, + { + "epoch": 4.264376996805112, + "grad_norm": 0.11874449253082275, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5339 + }, + { + "epoch": 4.26517571884984, + "grad_norm": 0.10718921571969986, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5340 + }, + { + "epoch": 4.265974440894569, + "grad_norm": 0.07337968051433563, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5341 + }, + { + "epoch": 4.266773162939297, + "grad_norm": 0.11872536689043045, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5342 + }, + { + "epoch": 4.267571884984026, + "grad_norm": 0.11199923604726791, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5343 + }, + { + "epoch": 4.268370607028754, + "grad_norm": 0.05864759162068367, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5344 + }, + { + "epoch": 4.269169329073483, + "grad_norm": 0.14757969975471497, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5345 + }, + { + "epoch": 4.2699680511182105, + "grad_norm": 0.12190169841051102, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5346 + }, + { + "epoch": 4.270766773162939, + "grad_norm": 0.0532461479306221, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5347 + }, + { + "epoch": 4.271565495207668, + "grad_norm": 0.10723208636045456, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5348 + }, + { + "epoch": 4.272364217252396, + "grad_norm": 0.07115229964256287, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5349 + }, + { + "epoch": 4.273162939297125, + "grad_norm": 0.07450878620147705, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 5350 + }, + { + "epoch": 4.273961661341853, + "grad_norm": 0.11793115735054016, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5351 + }, + { + "epoch": 4.274760383386582, + "grad_norm": 0.10440219938755035, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5352 + }, + { + "epoch": 4.27555910543131, + "grad_norm": 0.27991926670074463, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5353 + }, + { + "epoch": 4.276357827476039, + "grad_norm": 0.11090446263551712, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5354 + }, + { + "epoch": 4.277156549520766, + "grad_norm": 0.10509627312421799, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5355 + }, + { + "epoch": 4.277955271565495, + "grad_norm": 0.06217970326542854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5356 + }, + { + "epoch": 4.2787539936102235, + "grad_norm": 0.34369224309921265, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5357 + }, + { + "epoch": 4.279552715654952, + "grad_norm": 0.1246214285492897, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5358 + }, + { + "epoch": 4.2803514376996805, + "grad_norm": 0.06331677734851837, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5359 + }, + { + "epoch": 4.281150159744409, + "grad_norm": 0.08274740725755692, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5360 + }, + { + "epoch": 4.281948881789138, + "grad_norm": 0.06133527308702469, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5361 + }, + { + "epoch": 4.282747603833866, + "grad_norm": 0.09867174178361893, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5362 + }, + { + "epoch": 4.283546325878595, + "grad_norm": 0.09370579570531845, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5363 + }, + { + "epoch": 4.284345047923322, + "grad_norm": 0.2549540400505066, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5364 + }, + { + "epoch": 4.285143769968051, + "grad_norm": 0.1900271773338318, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5365 + }, + { + "epoch": 4.285942492012779, + "grad_norm": 0.21450525522232056, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5366 + }, + { + "epoch": 4.286741214057508, + "grad_norm": 0.1381012350320816, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5367 + }, + { + "epoch": 4.287539936102236, + "grad_norm": 0.0813983827829361, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5368 + }, + { + "epoch": 4.288338658146965, + "grad_norm": 0.16513130068778992, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5369 + }, + { + "epoch": 4.289137380191693, + "grad_norm": 0.10825667530298233, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5370 + }, + { + "epoch": 4.289936102236422, + "grad_norm": 0.07226242125034332, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5371 + }, + { + "epoch": 4.2907348242811505, + "grad_norm": 0.1278400719165802, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5372 + }, + { + "epoch": 4.291533546325878, + "grad_norm": 0.11092592030763626, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5373 + }, + { + "epoch": 4.292332268370607, + "grad_norm": 0.08732229471206665, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5374 + }, + { + "epoch": 4.293130990415335, + "grad_norm": 0.2182341367006302, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5375 + }, + { + "epoch": 4.293929712460064, + "grad_norm": 0.10107403993606567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5376 + }, + { + "epoch": 4.294728434504792, + "grad_norm": 0.13586364686489105, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5377 + }, + { + "epoch": 4.295527156549521, + "grad_norm": 0.3685734272003174, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5378 + }, + { + "epoch": 4.296325878594249, + "grad_norm": 0.13060712814331055, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5379 + }, + { + "epoch": 4.297124600638978, + "grad_norm": 0.05988436937332153, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5380 + }, + { + "epoch": 4.297923322683706, + "grad_norm": 0.14392045140266418, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 5381 + }, + { + "epoch": 4.298722044728435, + "grad_norm": 0.25003254413604736, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5382 + }, + { + "epoch": 4.2995207667731625, + "grad_norm": 0.055451687425374985, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5383 + }, + { + "epoch": 4.300319488817891, + "grad_norm": 0.11186914891004562, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5384 + }, + { + "epoch": 4.30111821086262, + "grad_norm": 0.11314704269170761, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5385 + }, + { + "epoch": 4.301916932907348, + "grad_norm": 0.43445560336112976, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5386 + }, + { + "epoch": 4.302715654952077, + "grad_norm": 0.09362242370843887, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5387 + }, + { + "epoch": 4.303514376996805, + "grad_norm": 0.04405852034687996, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5388 + }, + { + "epoch": 4.304313099041534, + "grad_norm": 0.12615318596363068, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5389 + }, + { + "epoch": 4.305111821086262, + "grad_norm": 0.1067153736948967, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5390 + }, + { + "epoch": 4.305910543130991, + "grad_norm": 0.05732683837413788, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5391 + }, + { + "epoch": 4.306709265175719, + "grad_norm": 0.2452571988105774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5392 + }, + { + "epoch": 4.307507987220447, + "grad_norm": 0.11733133345842361, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5393 + }, + { + "epoch": 4.3083067092651754, + "grad_norm": 0.06771894544363022, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5394 + }, + { + "epoch": 4.309105431309904, + "grad_norm": 0.12928563356399536, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5395 + }, + { + "epoch": 4.3099041533546325, + "grad_norm": 0.1777956187725067, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5396 + }, + { + "epoch": 4.310702875399361, + "grad_norm": 0.1281544715166092, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5397 + }, + { + "epoch": 4.31150159744409, + "grad_norm": 0.07120000571012497, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5398 + }, + { + "epoch": 4.312300319488818, + "grad_norm": 0.1270848512649536, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5399 + }, + { + "epoch": 4.313099041533547, + "grad_norm": 0.17685648798942566, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5400 + }, + { + "epoch": 4.313897763578275, + "grad_norm": 0.05070900544524193, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5401 + }, + { + "epoch": 4.314696485623003, + "grad_norm": 0.10543418675661087, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5402 + }, + { + "epoch": 4.315495207667731, + "grad_norm": 0.12336398661136627, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5403 + }, + { + "epoch": 4.31629392971246, + "grad_norm": 0.1583624631166458, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5404 + }, + { + "epoch": 4.317092651757188, + "grad_norm": 0.08186022192239761, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5405 + }, + { + "epoch": 4.317891373801917, + "grad_norm": 0.07562705129384995, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5406 + }, + { + "epoch": 4.318690095846645, + "grad_norm": 0.05275554209947586, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5407 + }, + { + "epoch": 4.319488817891374, + "grad_norm": 0.06432928144931793, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5408 + }, + { + "epoch": 4.3202875399361025, + "grad_norm": 0.08220377564430237, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5409 + }, + { + "epoch": 4.321086261980831, + "grad_norm": 0.07882758229970932, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5410 + }, + { + "epoch": 4.321884984025559, + "grad_norm": 0.138245090842247, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5411 + }, + { + "epoch": 4.322683706070287, + "grad_norm": 0.1127534806728363, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5412 + }, + { + "epoch": 4.323482428115016, + "grad_norm": 0.1985669732093811, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5413 + }, + { + "epoch": 4.324281150159744, + "grad_norm": 0.08023711293935776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5414 + }, + { + "epoch": 4.325079872204473, + "grad_norm": 0.13853015005588531, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5415 + }, + { + "epoch": 4.325878594249201, + "grad_norm": 0.18319782614707947, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5416 + }, + { + "epoch": 4.32667731629393, + "grad_norm": 0.073015958070755, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5417 + }, + { + "epoch": 4.327476038338658, + "grad_norm": 0.10771846771240234, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5418 + }, + { + "epoch": 4.328274760383387, + "grad_norm": 0.09512028843164444, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5419 + }, + { + "epoch": 4.329073482428115, + "grad_norm": 0.0822201818227768, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5420 + }, + { + "epoch": 4.329872204472843, + "grad_norm": 0.11839213222265244, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5421 + }, + { + "epoch": 4.330670926517572, + "grad_norm": 0.10274796187877655, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5422 + }, + { + "epoch": 4.3314696485623, + "grad_norm": 0.05896717682480812, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5423 + }, + { + "epoch": 4.332268370607029, + "grad_norm": 0.1268780380487442, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5424 + }, + { + "epoch": 4.333067092651757, + "grad_norm": 0.09173188358545303, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5425 + }, + { + "epoch": 4.333865814696486, + "grad_norm": 0.05155360326170921, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5426 + }, + { + "epoch": 4.334664536741214, + "grad_norm": 0.08836793899536133, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 4.335463258785943, + "grad_norm": 0.08620470017194748, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5428 + }, + { + "epoch": 4.336261980830671, + "grad_norm": 0.06972123682498932, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5429 + }, + { + "epoch": 4.3370607028754, + "grad_norm": 0.12461638450622559, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5430 + }, + { + "epoch": 4.337859424920127, + "grad_norm": 0.08546463400125504, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5431 + }, + { + "epoch": 4.338658146964856, + "grad_norm": 0.08495177328586578, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5432 + }, + { + "epoch": 4.3394568690095845, + "grad_norm": 0.13017377257347107, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5433 + }, + { + "epoch": 4.340255591054313, + "grad_norm": 0.13619504868984222, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5434 + }, + { + "epoch": 4.3410543130990416, + "grad_norm": 0.5835675597190857, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5435 + }, + { + "epoch": 4.34185303514377, + "grad_norm": 0.09355206042528152, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5436 + }, + { + "epoch": 4.342651757188499, + "grad_norm": 0.08626751601696014, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5437 + }, + { + "epoch": 4.343450479233227, + "grad_norm": 0.05652647092938423, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5438 + }, + { + "epoch": 4.344249201277956, + "grad_norm": 0.05232316255569458, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5439 + }, + { + "epoch": 4.345047923322683, + "grad_norm": 0.08115233480930328, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5440 + }, + { + "epoch": 4.345846645367412, + "grad_norm": 0.08757120370864868, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5441 + }, + { + "epoch": 4.34664536741214, + "grad_norm": 0.046224139630794525, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5442 + }, + { + "epoch": 4.347444089456869, + "grad_norm": 0.07967934757471085, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5443 + }, + { + "epoch": 4.348242811501597, + "grad_norm": 0.044298652559518814, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5444 + }, + { + "epoch": 4.349041533546326, + "grad_norm": 0.09021158516407013, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5445 + }, + { + "epoch": 4.3498402555910545, + "grad_norm": 0.12857890129089355, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5446 + }, + { + "epoch": 4.350638977635783, + "grad_norm": 0.05655589699745178, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5447 + }, + { + "epoch": 4.3514376996805115, + "grad_norm": 0.09304624050855637, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5448 + }, + { + "epoch": 4.352236421725239, + "grad_norm": 0.19815632700920105, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5449 + }, + { + "epoch": 4.353035143769968, + "grad_norm": 0.0526299886405468, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5450 + }, + { + "epoch": 4.353833865814696, + "grad_norm": 0.06432242691516876, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5451 + }, + { + "epoch": 4.354632587859425, + "grad_norm": 0.07848794758319855, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5452 + }, + { + "epoch": 4.355431309904153, + "grad_norm": 0.08260536193847656, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5453 + }, + { + "epoch": 4.356230031948882, + "grad_norm": 0.052810169756412506, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5454 + }, + { + "epoch": 4.35702875399361, + "grad_norm": 0.06942226737737656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5455 + }, + { + "epoch": 4.357827476038339, + "grad_norm": 0.13892871141433716, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5456 + }, + { + "epoch": 4.358626198083067, + "grad_norm": 0.15982909500598907, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5457 + }, + { + "epoch": 4.359424920127796, + "grad_norm": 0.08206653594970703, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5458 + }, + { + "epoch": 4.360223642172524, + "grad_norm": 0.08957790583372116, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5459 + }, + { + "epoch": 4.361022364217252, + "grad_norm": 0.03882770985364914, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5460 + }, + { + "epoch": 4.361821086261981, + "grad_norm": 0.0928555279970169, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5461 + }, + { + "epoch": 4.362619808306709, + "grad_norm": 0.057321447879076004, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5462 + }, + { + "epoch": 4.363418530351438, + "grad_norm": 0.0737103596329689, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5463 + }, + { + "epoch": 4.364217252396166, + "grad_norm": 0.06696293503046036, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5464 + }, + { + "epoch": 4.365015974440895, + "grad_norm": 0.04572489857673645, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5465 + }, + { + "epoch": 4.365814696485623, + "grad_norm": 0.094516322016716, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5466 + }, + { + "epoch": 4.366613418530352, + "grad_norm": 0.045576825737953186, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5467 + }, + { + "epoch": 4.36741214057508, + "grad_norm": 0.06839725375175476, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5468 + }, + { + "epoch": 4.368210862619808, + "grad_norm": 0.14465193450450897, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5469 + }, + { + "epoch": 4.3690095846645365, + "grad_norm": 0.07930073887109756, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5470 + }, + { + "epoch": 4.369808306709265, + "grad_norm": 0.06120619550347328, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5471 + }, + { + "epoch": 4.3706070287539935, + "grad_norm": 0.066256083548069, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5472 + }, + { + "epoch": 4.371405750798722, + "grad_norm": 0.11696353554725647, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5473 + }, + { + "epoch": 4.372204472843451, + "grad_norm": 0.11530395597219467, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5474 + }, + { + "epoch": 4.373003194888179, + "grad_norm": 0.05663579702377319, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5475 + }, + { + "epoch": 4.373801916932908, + "grad_norm": 0.1241946592926979, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5476 + }, + { + "epoch": 4.374600638977636, + "grad_norm": 0.1725323498249054, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5477 + }, + { + "epoch": 4.375399361022364, + "grad_norm": 0.09785371273756027, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5478 + }, + { + "epoch": 4.376198083067092, + "grad_norm": 0.0813792496919632, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5479 + }, + { + "epoch": 4.376996805111821, + "grad_norm": 0.17471592128276825, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5480 + }, + { + "epoch": 4.377795527156549, + "grad_norm": 0.1923220455646515, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5481 + }, + { + "epoch": 4.378594249201278, + "grad_norm": 0.09857932478189468, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5482 + }, + { + "epoch": 4.3793929712460065, + "grad_norm": 0.10073419660329819, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5483 + }, + { + "epoch": 4.380191693290735, + "grad_norm": 0.35731273889541626, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5484 + }, + { + "epoch": 4.3809904153354635, + "grad_norm": 0.12060656398534775, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5485 + }, + { + "epoch": 4.381789137380192, + "grad_norm": 0.10264381766319275, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5486 + }, + { + "epoch": 4.38258785942492, + "grad_norm": 0.0868317037820816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5487 + }, + { + "epoch": 4.383386581469648, + "grad_norm": 0.07722344994544983, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5488 + }, + { + "epoch": 4.384185303514377, + "grad_norm": 0.3690173327922821, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5489 + }, + { + "epoch": 4.384984025559105, + "grad_norm": 0.18400169909000397, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5490 + }, + { + "epoch": 4.385782747603834, + "grad_norm": 0.14671844244003296, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5491 + }, + { + "epoch": 4.386581469648562, + "grad_norm": 0.05277179554104805, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5492 + }, + { + "epoch": 4.387380191693291, + "grad_norm": 0.13593660295009613, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5493 + }, + { + "epoch": 4.388178913738019, + "grad_norm": 0.1318334937095642, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5494 + }, + { + "epoch": 4.388977635782748, + "grad_norm": 0.07189908623695374, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5495 + }, + { + "epoch": 4.389776357827476, + "grad_norm": 0.07969736307859421, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5496 + }, + { + "epoch": 4.390575079872204, + "grad_norm": 0.07449150085449219, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5497 + }, + { + "epoch": 4.391373801916933, + "grad_norm": 0.533295214176178, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5498 + }, + { + "epoch": 4.392172523961661, + "grad_norm": 0.10412111133337021, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5499 + }, + { + "epoch": 4.39297124600639, + "grad_norm": 0.08482066541910172, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5500 + }, + { + "epoch": 4.393769968051118, + "grad_norm": 0.08023949712514877, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5501 + }, + { + "epoch": 4.394568690095847, + "grad_norm": 0.16967490315437317, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5502 + }, + { + "epoch": 4.395367412140575, + "grad_norm": 0.1979716271162033, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5503 + }, + { + "epoch": 4.396166134185304, + "grad_norm": 0.09058263152837753, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5504 + }, + { + "epoch": 4.396964856230032, + "grad_norm": 0.13149574398994446, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5505 + }, + { + "epoch": 4.397763578274761, + "grad_norm": 0.08240146189928055, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5506 + }, + { + "epoch": 4.3985623003194885, + "grad_norm": 0.13789936900138855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5507 + }, + { + "epoch": 4.399361022364217, + "grad_norm": 0.18576087057590485, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5508 + }, + { + "epoch": 4.4001597444089455, + "grad_norm": 0.13780297338962555, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5509 + }, + { + "epoch": 4.400958466453674, + "grad_norm": 0.14724896848201752, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5510 + }, + { + "epoch": 4.401757188498403, + "grad_norm": 0.20418551564216614, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5511 + }, + { + "epoch": 4.402555910543131, + "grad_norm": 0.1841040551662445, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5512 + }, + { + "epoch": 4.40335463258786, + "grad_norm": 0.6994684338569641, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5513 + }, + { + "epoch": 4.404153354632588, + "grad_norm": 0.18882393836975098, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5514 + }, + { + "epoch": 4.404952076677317, + "grad_norm": 0.07170864939689636, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5515 + }, + { + "epoch": 4.405750798722044, + "grad_norm": 0.04765893518924713, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5516 + }, + { + "epoch": 4.406549520766773, + "grad_norm": 0.07294443249702454, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5517 + }, + { + "epoch": 4.407348242811501, + "grad_norm": 0.18566831946372986, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 5518 + }, + { + "epoch": 4.40814696485623, + "grad_norm": 0.10881441831588745, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5519 + }, + { + "epoch": 4.4089456869009584, + "grad_norm": 0.380438894033432, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5520 + }, + { + "epoch": 4.409744408945687, + "grad_norm": 0.19281962513923645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5521 + }, + { + "epoch": 4.4105431309904155, + "grad_norm": 0.05730361491441727, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5522 + }, + { + "epoch": 4.411341853035144, + "grad_norm": 0.09276643395423889, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5523 + }, + { + "epoch": 4.412140575079873, + "grad_norm": 0.070807084441185, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5524 + }, + { + "epoch": 4.4129392971246, + "grad_norm": 0.08902080357074738, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5525 + }, + { + "epoch": 4.413738019169329, + "grad_norm": 0.14861932396888733, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5526 + }, + { + "epoch": 4.414536741214057, + "grad_norm": 0.2678995728492737, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5527 + }, + { + "epoch": 4.415335463258786, + "grad_norm": 0.12902382016181946, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5528 + }, + { + "epoch": 4.416134185303514, + "grad_norm": 0.14999063313007355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5529 + }, + { + "epoch": 4.416932907348243, + "grad_norm": 0.13950730860233307, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5530 + }, + { + "epoch": 4.417731629392971, + "grad_norm": 0.12215374410152435, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5531 + }, + { + "epoch": 4.4185303514377, + "grad_norm": 0.12941284477710724, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5532 + }, + { + "epoch": 4.419329073482428, + "grad_norm": 0.22524291276931763, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5533 + }, + { + "epoch": 4.420127795527157, + "grad_norm": 0.0830528736114502, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5534 + }, + { + "epoch": 4.420926517571885, + "grad_norm": 0.1562981903553009, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5535 + }, + { + "epoch": 4.421725239616613, + "grad_norm": 0.19052654504776, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5536 + }, + { + "epoch": 4.422523961661342, + "grad_norm": 0.12264347821474075, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5537 + }, + { + "epoch": 4.42332268370607, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5538 + }, + { + "epoch": 4.424121405750799, + "grad_norm": 0.1412813812494278, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5539 + }, + { + "epoch": 4.424920127795527, + "grad_norm": 0.17808450758457184, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5540 + }, + { + "epoch": 4.425718849840256, + "grad_norm": 0.43806061148643494, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5541 + }, + { + "epoch": 4.426517571884984, + "grad_norm": 0.17728228867053986, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5542 + }, + { + "epoch": 4.427316293929713, + "grad_norm": 0.12434227764606476, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5543 + }, + { + "epoch": 4.428115015974441, + "grad_norm": 0.10051420331001282, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5544 + }, + { + "epoch": 4.428913738019169, + "grad_norm": 0.0943203940987587, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5545 + }, + { + "epoch": 4.4297124600638975, + "grad_norm": 0.08082996308803558, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5546 + }, + { + "epoch": 4.430511182108626, + "grad_norm": 0.13405202329158783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5547 + }, + { + "epoch": 4.431309904153355, + "grad_norm": 0.10448389500379562, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5548 + }, + { + "epoch": 4.432108626198083, + "grad_norm": 0.32405009865760803, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5549 + }, + { + "epoch": 4.432907348242812, + "grad_norm": 0.09690065681934357, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5550 + }, + { + "epoch": 4.43370607028754, + "grad_norm": 0.35410076379776, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5551 + }, + { + "epoch": 4.434504792332269, + "grad_norm": 0.17826306819915771, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5552 + }, + { + "epoch": 4.435303514376997, + "grad_norm": 0.2252579778432846, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5553 + }, + { + "epoch": 4.436102236421725, + "grad_norm": 0.09508918970823288, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5554 + }, + { + "epoch": 4.436900958466453, + "grad_norm": 0.16872358322143555, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5555 + }, + { + "epoch": 4.437699680511182, + "grad_norm": 0.24836355447769165, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5556 + }, + { + "epoch": 4.43849840255591, + "grad_norm": 0.20887835323810577, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5557 + }, + { + "epoch": 4.439297124600639, + "grad_norm": 0.10922685265541077, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5558 + }, + { + "epoch": 4.4400958466453675, + "grad_norm": 0.44561028480529785, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5559 + }, + { + "epoch": 4.440894568690096, + "grad_norm": 0.18160179257392883, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5560 + }, + { + "epoch": 4.4416932907348246, + "grad_norm": 0.06924877315759659, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5561 + }, + { + "epoch": 4.442492012779553, + "grad_norm": 0.15605933964252472, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5562 + }, + { + "epoch": 4.443290734824281, + "grad_norm": 0.10880772024393082, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5563 + }, + { + "epoch": 4.444089456869009, + "grad_norm": 0.1252668797969818, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5564 + }, + { + "epoch": 4.444888178913738, + "grad_norm": 0.20452634990215302, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5565 + }, + { + "epoch": 4.445686900958466, + "grad_norm": 0.20973001420497894, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5566 + }, + { + "epoch": 4.446485623003195, + "grad_norm": 0.07631060481071472, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5567 + }, + { + "epoch": 4.447284345047923, + "grad_norm": 0.14793622493743896, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5568 + }, + { + "epoch": 4.448083067092652, + "grad_norm": 0.30125850439071655, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5569 + }, + { + "epoch": 4.44888178913738, + "grad_norm": 0.1291274130344391, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5570 + }, + { + "epoch": 4.449680511182109, + "grad_norm": 0.08679793030023575, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5571 + }, + { + "epoch": 4.4504792332268375, + "grad_norm": 0.11555953323841095, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5572 + }, + { + "epoch": 4.451277955271565, + "grad_norm": 0.10711846500635147, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5573 + }, + { + "epoch": 4.452076677316294, + "grad_norm": 0.0604897104203701, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5574 + }, + { + "epoch": 4.452875399361022, + "grad_norm": 0.08729933202266693, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5575 + }, + { + "epoch": 4.453674121405751, + "grad_norm": 0.09586715698242188, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5576 + }, + { + "epoch": 4.454472843450479, + "grad_norm": 0.11635993421077728, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5577 + }, + { + "epoch": 4.455271565495208, + "grad_norm": 0.12405801564455032, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5578 + }, + { + "epoch": 4.456070287539936, + "grad_norm": 0.1284986287355423, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5579 + }, + { + "epoch": 4.456869009584665, + "grad_norm": 0.09059973061084747, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5580 + }, + { + "epoch": 4.457667731629393, + "grad_norm": 0.08497101068496704, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5581 + }, + { + "epoch": 4.458466453674122, + "grad_norm": 0.10315481573343277, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5582 + }, + { + "epoch": 4.4592651757188495, + "grad_norm": 0.09923984855413437, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5583 + }, + { + "epoch": 4.460063897763578, + "grad_norm": 0.09179794788360596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5584 + }, + { + "epoch": 4.460862619808307, + "grad_norm": 0.0783005952835083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5585 + }, + { + "epoch": 4.461661341853035, + "grad_norm": 0.4005993604660034, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5586 + }, + { + "epoch": 4.462460063897764, + "grad_norm": 0.09382215887308121, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5587 + }, + { + "epoch": 4.463258785942492, + "grad_norm": 0.10208452492952347, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5588 + }, + { + "epoch": 4.464057507987221, + "grad_norm": 0.08237040042877197, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5589 + }, + { + "epoch": 4.464856230031949, + "grad_norm": 0.07287969440221786, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 5590 + }, + { + "epoch": 4.465654952076678, + "grad_norm": 0.07156763970851898, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5591 + }, + { + "epoch": 4.466453674121405, + "grad_norm": 0.11347219347953796, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5592 + }, + { + "epoch": 4.467252396166134, + "grad_norm": 0.13722039759159088, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5593 + }, + { + "epoch": 4.468051118210862, + "grad_norm": 0.20186153054237366, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5594 + }, + { + "epoch": 4.468849840255591, + "grad_norm": 0.1548159420490265, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5595 + }, + { + "epoch": 4.4696485623003195, + "grad_norm": 0.08960088342428207, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5596 + }, + { + "epoch": 4.470447284345048, + "grad_norm": 0.23552097380161285, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5597 + }, + { + "epoch": 4.4712460063897765, + "grad_norm": 0.34478914737701416, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5598 + }, + { + "epoch": 4.472044728434505, + "grad_norm": 0.219953253865242, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5599 + }, + { + "epoch": 4.472843450479234, + "grad_norm": 0.13104191422462463, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5600 + }, + { + "epoch": 4.473642172523961, + "grad_norm": 0.2867056131362915, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5601 + }, + { + "epoch": 4.47444089456869, + "grad_norm": 0.15794725716114044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5602 + }, + { + "epoch": 4.475239616613418, + "grad_norm": 0.10884165018796921, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5603 + }, + { + "epoch": 4.476038338658147, + "grad_norm": 1.0521267652511597, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5604 + }, + { + "epoch": 4.476837060702875, + "grad_norm": 0.07823536545038223, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5605 + }, + { + "epoch": 4.477635782747604, + "grad_norm": 0.1536101996898651, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5606 + }, + { + "epoch": 4.478434504792332, + "grad_norm": 0.1379251778125763, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5607 + }, + { + "epoch": 4.479233226837061, + "grad_norm": 0.06181122735142708, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5608 + }, + { + "epoch": 4.4800319488817895, + "grad_norm": 0.1701904535293579, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5609 + }, + { + "epoch": 4.480830670926517, + "grad_norm": 0.1322227120399475, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5610 + }, + { + "epoch": 4.481629392971246, + "grad_norm": 0.09158491343259811, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5611 + }, + { + "epoch": 4.482428115015974, + "grad_norm": 0.09851136803627014, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5612 + }, + { + "epoch": 4.483226837060703, + "grad_norm": 0.09350419789552689, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5613 + }, + { + "epoch": 4.484025559105431, + "grad_norm": 0.40614885091781616, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5614 + }, + { + "epoch": 4.48482428115016, + "grad_norm": 0.1653166264295578, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5615 + }, + { + "epoch": 4.485623003194888, + "grad_norm": 0.13429352641105652, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5616 + }, + { + "epoch": 4.486421725239617, + "grad_norm": 0.09340473264455795, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5617 + }, + { + "epoch": 4.487220447284345, + "grad_norm": 0.1621188223361969, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5618 + }, + { + "epoch": 4.488019169329074, + "grad_norm": 0.18538816273212433, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5619 + }, + { + "epoch": 4.488817891373802, + "grad_norm": 0.26981350779533386, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5620 + }, + { + "epoch": 4.48961661341853, + "grad_norm": 0.28865110874176025, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5621 + }, + { + "epoch": 4.4904153354632586, + "grad_norm": 0.23013874888420105, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5622 + }, + { + "epoch": 4.491214057507987, + "grad_norm": 0.08305853605270386, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5623 + }, + { + "epoch": 4.492012779552716, + "grad_norm": 0.1810445487499237, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5624 + }, + { + "epoch": 4.492811501597444, + "grad_norm": 0.23000332713127136, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5625 + }, + { + "epoch": 4.493610223642173, + "grad_norm": 0.06753652542829514, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5626 + }, + { + "epoch": 4.494408945686901, + "grad_norm": 0.19956068694591522, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5627 + }, + { + "epoch": 4.49520766773163, + "grad_norm": 0.24572248756885529, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5628 + }, + { + "epoch": 4.496006389776358, + "grad_norm": 0.06617605686187744, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5629 + }, + { + "epoch": 4.496805111821086, + "grad_norm": 0.18551495671272278, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.497603833865814, + "grad_norm": 0.16827648878097534, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5631 + }, + { + "epoch": 4.498402555910543, + "grad_norm": 0.13273993134498596, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5632 + }, + { + "epoch": 4.4992012779552715, + "grad_norm": 0.24461479485034943, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5633 + }, + { + "epoch": 4.5, + "grad_norm": 0.2016836553812027, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5634 + }, + { + "epoch": 4.5007987220447285, + "grad_norm": 0.07513006776571274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5635 + }, + { + "epoch": 4.501597444089457, + "grad_norm": 0.1701919138431549, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5636 + }, + { + "epoch": 4.502396166134186, + "grad_norm": 0.12785466015338898, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5637 + }, + { + "epoch": 4.503194888178914, + "grad_norm": 0.1135641485452652, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5638 + }, + { + "epoch": 4.503993610223642, + "grad_norm": 0.5004979372024536, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5639 + }, + { + "epoch": 4.50479233226837, + "grad_norm": 0.28730812668800354, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5640 + }, + { + "epoch": 4.505591054313099, + "grad_norm": 0.3666481673717499, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5641 + }, + { + "epoch": 4.506389776357827, + "grad_norm": 0.257710337638855, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5642 + }, + { + "epoch": 4.507188498402556, + "grad_norm": 0.20071941614151, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5643 + }, + { + "epoch": 4.507987220447284, + "grad_norm": 0.3445729613304138, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5644 + }, + { + "epoch": 4.508785942492013, + "grad_norm": 0.20297282934188843, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5645 + }, + { + "epoch": 4.5095846645367414, + "grad_norm": 0.1889636069536209, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5646 + }, + { + "epoch": 4.51038338658147, + "grad_norm": 0.2153794765472412, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5647 + }, + { + "epoch": 4.511182108626198, + "grad_norm": 0.15353621542453766, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5648 + }, + { + "epoch": 4.511980830670926, + "grad_norm": 0.1575399786233902, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5649 + }, + { + "epoch": 4.512779552715655, + "grad_norm": 0.5555608868598938, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5650 + }, + { + "epoch": 4.513578274760383, + "grad_norm": 0.26887524127960205, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5651 + }, + { + "epoch": 4.514376996805112, + "grad_norm": 0.11516866087913513, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5652 + }, + { + "epoch": 4.51517571884984, + "grad_norm": 0.19820965826511383, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5653 + }, + { + "epoch": 4.515974440894569, + "grad_norm": 0.2122081071138382, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5654 + }, + { + "epoch": 4.516773162939297, + "grad_norm": 0.10736703872680664, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5655 + }, + { + "epoch": 4.517571884984026, + "grad_norm": 0.09852312505245209, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5656 + }, + { + "epoch": 4.518370607028754, + "grad_norm": 0.07539162784814835, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5657 + }, + { + "epoch": 4.519169329073483, + "grad_norm": 0.07467353343963623, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5658 + }, + { + "epoch": 4.5199680511182105, + "grad_norm": 0.09987884759902954, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5659 + }, + { + "epoch": 4.520766773162939, + "grad_norm": 0.08720221370458603, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5660 + }, + { + "epoch": 4.521565495207668, + "grad_norm": 0.07798969000577927, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5661 + }, + { + "epoch": 4.522364217252396, + "grad_norm": 0.12410122901201248, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5662 + }, + { + "epoch": 4.523162939297125, + "grad_norm": 0.07746852934360504, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5663 + }, + { + "epoch": 4.523961661341853, + "grad_norm": 0.09171058982610703, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5664 + }, + { + "epoch": 4.524760383386582, + "grad_norm": 0.8176944255828857, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5665 + }, + { + "epoch": 4.52555910543131, + "grad_norm": 0.4282614290714264, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5666 + }, + { + "epoch": 4.526357827476039, + "grad_norm": 0.35193827748298645, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5667 + }, + { + "epoch": 4.527156549520766, + "grad_norm": 0.15641339123249054, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5668 + }, + { + "epoch": 4.527955271565495, + "grad_norm": 0.31442952156066895, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5669 + }, + { + "epoch": 4.5287539936102235, + "grad_norm": 0.3205500841140747, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5670 + }, + { + "epoch": 4.529552715654952, + "grad_norm": 0.2866390645503998, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5671 + }, + { + "epoch": 4.5303514376996805, + "grad_norm": 0.21028868854045868, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5672 + }, + { + "epoch": 4.531150159744409, + "grad_norm": 0.32687097787857056, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5673 + }, + { + "epoch": 4.531948881789138, + "grad_norm": 0.25662627816200256, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5674 + }, + { + "epoch": 4.532747603833866, + "grad_norm": 0.10192561894655228, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5675 + }, + { + "epoch": 4.533546325878595, + "grad_norm": 0.8102573752403259, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5676 + }, + { + "epoch": 4.534345047923322, + "grad_norm": 0.19127781689167023, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5677 + }, + { + "epoch": 4.535143769968051, + "grad_norm": 0.22435548901557922, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5678 + }, + { + "epoch": 4.535942492012779, + "grad_norm": 0.3271692395210266, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5679 + }, + { + "epoch": 4.536741214057508, + "grad_norm": 0.17226184904575348, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5680 + }, + { + "epoch": 4.537539936102236, + "grad_norm": 0.16628077626228333, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5681 + }, + { + "epoch": 4.538338658146965, + "grad_norm": 0.6196639537811279, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5682 + }, + { + "epoch": 4.539137380191693, + "grad_norm": 0.21590936183929443, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5683 + }, + { + "epoch": 4.539936102236422, + "grad_norm": 0.16313950717449188, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5684 + }, + { + "epoch": 4.5407348242811505, + "grad_norm": 0.12859022617340088, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5685 + }, + { + "epoch": 4.541533546325878, + "grad_norm": 0.1189458817243576, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5686 + }, + { + "epoch": 4.542332268370607, + "grad_norm": 6.769774913787842, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5687 + }, + { + "epoch": 4.543130990415335, + "grad_norm": 0.20253166556358337, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5688 + }, + { + "epoch": 4.543929712460064, + "grad_norm": 0.11631135642528534, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5689 + }, + { + "epoch": 4.544728434504792, + "grad_norm": 0.1848360300064087, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5690 + }, + { + "epoch": 4.545527156549521, + "grad_norm": 0.17804184556007385, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 5691 + }, + { + "epoch": 4.546325878594249, + "grad_norm": 0.2214183509349823, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5692 + }, + { + "epoch": 4.547124600638978, + "grad_norm": 16.448396682739258, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5693 + }, + { + "epoch": 4.547923322683706, + "grad_norm": 0.4933917224407196, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 5694 + }, + { + "epoch": 4.548722044728435, + "grad_norm": 0.41254448890686035, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 5695 + }, + { + "epoch": 4.549520766773163, + "grad_norm": 0.28898510336875916, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 5696 + }, + { + "epoch": 4.550319488817891, + "grad_norm": 0.2938457727432251, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5697 + }, + { + "epoch": 4.55111821086262, + "grad_norm": 0.2264672964811325, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5698 + }, + { + "epoch": 4.551916932907348, + "grad_norm": 0.12931588292121887, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5699 + }, + { + "epoch": 4.552715654952077, + "grad_norm": 0.22106601297855377, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5700 + }, + { + "epoch": 4.553514376996805, + "grad_norm": 0.31875962018966675, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5701 + }, + { + "epoch": 4.554313099041534, + "grad_norm": 0.3129211962223053, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5702 + }, + { + "epoch": 4.555111821086262, + "grad_norm": 0.1613578200340271, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5703 + }, + { + "epoch": 4.555910543130991, + "grad_norm": 0.6340786814689636, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 5704 + }, + { + "epoch": 4.556709265175719, + "grad_norm": 0.13203595578670502, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5705 + }, + { + "epoch": 4.557507987220447, + "grad_norm": 0.16561077535152435, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5706 + }, + { + "epoch": 4.5583067092651754, + "grad_norm": 0.17777414619922638, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 5707 + }, + { + "epoch": 4.559105431309904, + "grad_norm": 0.6985258460044861, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 5708 + }, + { + "epoch": 4.5599041533546325, + "grad_norm": 0.18673790991306305, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5709 + }, + { + "epoch": 4.560702875399361, + "grad_norm": 0.10636870563030243, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5710 + }, + { + "epoch": 4.56150159744409, + "grad_norm": 0.1719052493572235, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5711 + }, + { + "epoch": 4.562300319488818, + "grad_norm": 0.7030455470085144, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5712 + }, + { + "epoch": 4.563099041533547, + "grad_norm": 0.1482628434896469, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 5713 + }, + { + "epoch": 4.563897763578275, + "grad_norm": 0.1585852950811386, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5714 + }, + { + "epoch": 4.564696485623003, + "grad_norm": 0.16067056357860565, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5715 + }, + { + "epoch": 4.565495207667731, + "grad_norm": 0.16162389516830444, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5716 + }, + { + "epoch": 4.56629392971246, + "grad_norm": 0.07224202156066895, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5717 + }, + { + "epoch": 4.567092651757188, + "grad_norm": 0.2577751576900482, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5718 + }, + { + "epoch": 4.567891373801917, + "grad_norm": 1.676942229270935, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5719 + }, + { + "epoch": 4.568690095846645, + "grad_norm": 0.11058419197797775, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5720 + }, + { + "epoch": 4.569488817891374, + "grad_norm": 0.23155376315116882, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5721 + }, + { + "epoch": 4.5702875399361025, + "grad_norm": 0.1197747215628624, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5722 + }, + { + "epoch": 4.571086261980831, + "grad_norm": 0.5179840326309204, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5723 + }, + { + "epoch": 4.571884984025559, + "grad_norm": 0.17717961966991425, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5724 + }, + { + "epoch": 4.572683706070287, + "grad_norm": 0.1513422429561615, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5725 + }, + { + "epoch": 4.573482428115016, + "grad_norm": 0.15495018661022186, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5726 + }, + { + "epoch": 4.574281150159744, + "grad_norm": 3.4248743057250977, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5727 + }, + { + "epoch": 4.575079872204473, + "grad_norm": 0.29529228806495667, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5728 + }, + { + "epoch": 4.575878594249201, + "grad_norm": 0.21125876903533936, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 5729 + }, + { + "epoch": 4.57667731629393, + "grad_norm": 0.16381484270095825, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5730 + }, + { + "epoch": 4.577476038338658, + "grad_norm": 0.2144167572259903, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 5731 + }, + { + "epoch": 4.578274760383387, + "grad_norm": 0.1564428210258484, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5732 + }, + { + "epoch": 4.5790734824281145, + "grad_norm": 0.21137529611587524, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5733 + }, + { + "epoch": 4.579872204472844, + "grad_norm": 0.13836248219013214, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5734 + }, + { + "epoch": 4.580670926517572, + "grad_norm": 0.11749537289142609, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5735 + }, + { + "epoch": 4.5814696485623, + "grad_norm": 0.10901704430580139, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5736 + }, + { + "epoch": 4.582268370607029, + "grad_norm": 0.08402425795793533, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5737 + }, + { + "epoch": 4.583067092651757, + "grad_norm": 0.1502164900302887, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5738 + }, + { + "epoch": 4.583865814696486, + "grad_norm": 0.10606876760721207, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5739 + }, + { + "epoch": 4.584664536741214, + "grad_norm": 0.11868279427289963, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5740 + }, + { + "epoch": 4.585463258785943, + "grad_norm": 0.10678767412900925, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5741 + }, + { + "epoch": 4.586261980830671, + "grad_norm": 0.28886285424232483, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5742 + }, + { + "epoch": 4.5870607028754, + "grad_norm": 0.3516097366809845, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5743 + }, + { + "epoch": 4.587859424920127, + "grad_norm": 0.10221854597330093, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5744 + }, + { + "epoch": 4.588658146964856, + "grad_norm": 0.24786177277565002, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5745 + }, + { + "epoch": 4.5894568690095845, + "grad_norm": 0.10537181794643402, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5746 + }, + { + "epoch": 4.590255591054313, + "grad_norm": 0.23574885725975037, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5747 + }, + { + "epoch": 4.5910543130990416, + "grad_norm": 0.1483563631772995, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5748 + }, + { + "epoch": 4.59185303514377, + "grad_norm": 0.1516815721988678, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 5749 + }, + { + "epoch": 4.592651757188499, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5750 + }, + { + "epoch": 4.593450479233227, + "grad_norm": 0.10706239938735962, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5751 + }, + { + "epoch": 4.594249201277956, + "grad_norm": 1.081868290901184, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5752 + }, + { + "epoch": 4.595047923322683, + "grad_norm": 0.4016919732093811, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5753 + }, + { + "epoch": 4.595846645367412, + "grad_norm": 0.3266371786594391, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5754 + }, + { + "epoch": 4.59664536741214, + "grad_norm": 0.23380769789218903, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5755 + }, + { + "epoch": 4.597444089456869, + "grad_norm": 0.2521349787712097, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 5756 + }, + { + "epoch": 4.598242811501597, + "grad_norm": 0.2223331481218338, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5757 + }, + { + "epoch": 4.599041533546326, + "grad_norm": 0.177442729473114, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 5758 + }, + { + "epoch": 4.5998402555910545, + "grad_norm": 0.18474844098091125, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 5759 + }, + { + "epoch": 4.600638977635783, + "grad_norm": 0.1686495542526245, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5760 + }, + { + "epoch": 4.6014376996805115, + "grad_norm": 0.13674414157867432, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5761 + }, + { + "epoch": 4.602236421725239, + "grad_norm": 0.1390203833580017, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 5762 + }, + { + "epoch": 4.603035143769968, + "grad_norm": 0.10701096057891846, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5763 + }, + { + "epoch": 4.603833865814696, + "grad_norm": 0.110149085521698, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5764 + }, + { + "epoch": 4.604632587859425, + "grad_norm": 0.2477579116821289, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5765 + }, + { + "epoch": 4.605431309904153, + "grad_norm": 0.2554718852043152, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5766 + }, + { + "epoch": 4.606230031948882, + "grad_norm": 0.1945963203907013, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5767 + }, + { + "epoch": 4.60702875399361, + "grad_norm": 0.26785531640052795, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5768 + }, + { + "epoch": 4.607827476038339, + "grad_norm": 0.3007332980632782, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5769 + }, + { + "epoch": 4.608626198083067, + "grad_norm": 0.09973788261413574, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5770 + }, + { + "epoch": 4.609424920127795, + "grad_norm": 0.09176181256771088, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5771 + }, + { + "epoch": 4.6102236421725244, + "grad_norm": 0.1395607590675354, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5772 + }, + { + "epoch": 4.611022364217252, + "grad_norm": 0.8938566446304321, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5773 + }, + { + "epoch": 4.611821086261981, + "grad_norm": 0.3093889653682709, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5774 + }, + { + "epoch": 4.612619808306709, + "grad_norm": 0.1910911351442337, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5775 + }, + { + "epoch": 4.613418530351438, + "grad_norm": 0.11586496978998184, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5776 + }, + { + "epoch": 4.614217252396166, + "grad_norm": 0.222470223903656, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 5777 + }, + { + "epoch": 4.615015974440895, + "grad_norm": 0.16580955684185028, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5778 + }, + { + "epoch": 4.615814696485623, + "grad_norm": 0.11279458552598953, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5779 + }, + { + "epoch": 4.616613418530352, + "grad_norm": 0.10970400273799896, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5780 + }, + { + "epoch": 4.61741214057508, + "grad_norm": 0.11291752755641937, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5781 + }, + { + "epoch": 4.618210862619808, + "grad_norm": 0.19262762367725372, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5782 + }, + { + "epoch": 4.6190095846645365, + "grad_norm": 0.12736102938652039, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5783 + }, + { + "epoch": 4.619808306709265, + "grad_norm": 0.09300720691680908, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5784 + }, + { + "epoch": 4.6206070287539935, + "grad_norm": 0.09544654190540314, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5785 + }, + { + "epoch": 4.621405750798722, + "grad_norm": 0.2888239026069641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5786 + }, + { + "epoch": 4.622204472843451, + "grad_norm": 0.22988484799861908, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5787 + }, + { + "epoch": 4.623003194888179, + "grad_norm": 0.2574143707752228, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5788 + }, + { + "epoch": 4.623801916932908, + "grad_norm": 0.2503221333026886, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5789 + }, + { + "epoch": 4.624600638977636, + "grad_norm": 0.20846052467823029, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5790 + }, + { + "epoch": 4.625399361022364, + "grad_norm": 0.218403160572052, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5791 + }, + { + "epoch": 4.626198083067092, + "grad_norm": 0.11333920061588287, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5792 + }, + { + "epoch": 4.626996805111821, + "grad_norm": 0.19022895395755768, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5793 + }, + { + "epoch": 4.627795527156549, + "grad_norm": 0.1525644063949585, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 5794 + }, + { + "epoch": 4.628594249201278, + "grad_norm": 0.07636452466249466, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5795 + }, + { + "epoch": 4.6293929712460065, + "grad_norm": 0.1358552873134613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5796 + }, + { + "epoch": 4.630191693290735, + "grad_norm": 0.08993138372898102, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5797 + }, + { + "epoch": 4.6309904153354635, + "grad_norm": 0.15454545617103577, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5798 + }, + { + "epoch": 4.631789137380192, + "grad_norm": 0.12256992608308792, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5799 + }, + { + "epoch": 4.63258785942492, + "grad_norm": 0.08453187346458435, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5800 + }, + { + "epoch": 4.633386581469648, + "grad_norm": 0.1474936157464981, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5801 + }, + { + "epoch": 4.634185303514377, + "grad_norm": 0.11481066793203354, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5802 + }, + { + "epoch": 4.634984025559105, + "grad_norm": 0.41141587495803833, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5803 + }, + { + "epoch": 4.635782747603834, + "grad_norm": 0.1509549766778946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5804 + }, + { + "epoch": 4.636581469648562, + "grad_norm": 0.13562771677970886, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5805 + }, + { + "epoch": 4.637380191693291, + "grad_norm": 0.09722459316253662, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5806 + }, + { + "epoch": 4.638178913738019, + "grad_norm": 0.3194493353366852, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5807 + }, + { + "epoch": 4.638977635782748, + "grad_norm": 0.23091651499271393, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5808 + }, + { + "epoch": 4.6397763578274756, + "grad_norm": 0.1682155877351761, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5809 + }, + { + "epoch": 4.640575079872205, + "grad_norm": 0.37293288111686707, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5810 + }, + { + "epoch": 4.641373801916933, + "grad_norm": 0.3746488094329834, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5811 + }, + { + "epoch": 4.642172523961661, + "grad_norm": 0.2068052738904953, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5812 + }, + { + "epoch": 4.64297124600639, + "grad_norm": 0.13229581713676453, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5813 + }, + { + "epoch": 4.643769968051118, + "grad_norm": 0.24158459901809692, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5814 + }, + { + "epoch": 4.644568690095847, + "grad_norm": 0.4241867959499359, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5815 + }, + { + "epoch": 4.645367412140575, + "grad_norm": 0.40008923411369324, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5816 + }, + { + "epoch": 4.646166134185304, + "grad_norm": 0.3150584101676941, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 5817 + }, + { + "epoch": 4.646964856230032, + "grad_norm": 0.11021434515714645, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5818 + }, + { + "epoch": 4.647763578274761, + "grad_norm": 0.30061402916908264, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5819 + }, + { + "epoch": 4.6485623003194885, + "grad_norm": 0.12583592534065247, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5820 + }, + { + "epoch": 4.649361022364217, + "grad_norm": 0.31917983293533325, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5821 + }, + { + "epoch": 4.6501597444089455, + "grad_norm": 0.2097153663635254, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5822 + }, + { + "epoch": 4.650958466453674, + "grad_norm": 0.19847621023654938, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5823 + }, + { + "epoch": 4.651757188498403, + "grad_norm": 0.2482050508260727, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5824 + }, + { + "epoch": 4.652555910543131, + "grad_norm": 0.1257491409778595, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5825 + }, + { + "epoch": 4.65335463258786, + "grad_norm": 0.2192201465368271, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5826 + }, + { + "epoch": 4.654153354632588, + "grad_norm": 0.16453656554222107, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5827 + }, + { + "epoch": 4.654952076677317, + "grad_norm": 0.18813923001289368, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5828 + }, + { + "epoch": 4.655750798722044, + "grad_norm": 0.1811141073703766, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5829 + }, + { + "epoch": 4.656549520766773, + "grad_norm": 0.08911352604627609, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5830 + }, + { + "epoch": 4.657348242811501, + "grad_norm": 0.17858019471168518, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5831 + }, + { + "epoch": 4.65814696485623, + "grad_norm": 0.27315759658813477, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5832 + }, + { + "epoch": 4.6589456869009584, + "grad_norm": 0.18612337112426758, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5833 + }, + { + "epoch": 4.659744408945687, + "grad_norm": 0.2646125257015228, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5834 + }, + { + "epoch": 4.6605431309904155, + "grad_norm": 0.07320903241634369, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5835 + }, + { + "epoch": 4.661341853035144, + "grad_norm": 0.12969297170639038, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5836 + }, + { + "epoch": 4.662140575079873, + "grad_norm": 0.37665078043937683, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5837 + }, + { + "epoch": 4.6629392971246, + "grad_norm": 0.11055029928684235, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 5838 + }, + { + "epoch": 4.663738019169329, + "grad_norm": 0.12279482185840607, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5839 + }, + { + "epoch": 4.664536741214057, + "grad_norm": 0.0686316192150116, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5840 + }, + { + "epoch": 4.665335463258786, + "grad_norm": 0.09705425798892975, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5841 + }, + { + "epoch": 4.666134185303514, + "grad_norm": 0.09543570131063461, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5842 + }, + { + "epoch": 4.666932907348243, + "grad_norm": 0.08460460603237152, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5843 + }, + { + "epoch": 4.667731629392971, + "grad_norm": 0.12419378757476807, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5844 + }, + { + "epoch": 4.6685303514377, + "grad_norm": 0.09184019267559052, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5845 + }, + { + "epoch": 4.669329073482428, + "grad_norm": 0.09425100684165955, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5846 + }, + { + "epoch": 4.670127795527156, + "grad_norm": 0.19701971113681793, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5847 + }, + { + "epoch": 4.6709265175718855, + "grad_norm": 0.0648239254951477, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5848 + }, + { + "epoch": 4.671725239616613, + "grad_norm": 0.11558888107538223, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5849 + }, + { + "epoch": 4.672523961661342, + "grad_norm": 0.12397976219654083, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5850 + }, + { + "epoch": 4.67332268370607, + "grad_norm": 0.10640132427215576, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5851 + }, + { + "epoch": 4.674121405750799, + "grad_norm": 0.08930578827857971, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5852 + }, + { + "epoch": 4.674920127795527, + "grad_norm": 0.06212310120463371, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5853 + }, + { + "epoch": 4.675718849840256, + "grad_norm": 0.08568188548088074, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5854 + }, + { + "epoch": 4.676517571884984, + "grad_norm": 0.11431021988391876, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5855 + }, + { + "epoch": 4.677316293929713, + "grad_norm": 0.34381258487701416, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5856 + }, + { + "epoch": 4.678115015974441, + "grad_norm": 0.1996181309223175, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5857 + }, + { + "epoch": 4.678913738019169, + "grad_norm": 0.2900290787220001, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5858 + }, + { + "epoch": 4.6797124600638975, + "grad_norm": 0.35768410563468933, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5859 + }, + { + "epoch": 4.680511182108626, + "grad_norm": 0.1027536615729332, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5860 + }, + { + "epoch": 4.681309904153355, + "grad_norm": 0.6286419630050659, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5861 + }, + { + "epoch": 4.682108626198083, + "grad_norm": 0.5037242770195007, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5862 + }, + { + "epoch": 4.682907348242812, + "grad_norm": 0.34654417634010315, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5863 + }, + { + "epoch": 4.68370607028754, + "grad_norm": 0.18139366805553436, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5864 + }, + { + "epoch": 4.684504792332269, + "grad_norm": 0.2101605087518692, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5865 + }, + { + "epoch": 4.685303514376997, + "grad_norm": 0.0922360047698021, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5866 + }, + { + "epoch": 4.686102236421725, + "grad_norm": 0.23476624488830566, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5867 + }, + { + "epoch": 4.686900958466453, + "grad_norm": 0.1843792051076889, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5868 + }, + { + "epoch": 4.687699680511182, + "grad_norm": 0.09449298679828644, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5869 + }, + { + "epoch": 4.68849840255591, + "grad_norm": 0.13996686041355133, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5870 + }, + { + "epoch": 4.689297124600639, + "grad_norm": 2.113325357437134, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5871 + }, + { + "epoch": 4.6900958466453675, + "grad_norm": 0.35181209444999695, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 5872 + }, + { + "epoch": 4.690894568690096, + "grad_norm": 0.3530768156051636, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 5873 + }, + { + "epoch": 4.6916932907348246, + "grad_norm": 0.25919783115386963, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5874 + }, + { + "epoch": 4.692492012779553, + "grad_norm": 0.19770720601081848, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 5875 + }, + { + "epoch": 4.693290734824281, + "grad_norm": 0.32085585594177246, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5876 + }, + { + "epoch": 4.694089456869009, + "grad_norm": 0.14215363562107086, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5877 + }, + { + "epoch": 4.694888178913738, + "grad_norm": 0.24502497911453247, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5878 + }, + { + "epoch": 4.695686900958466, + "grad_norm": 0.15765784680843353, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5879 + }, + { + "epoch": 4.696485623003195, + "grad_norm": 0.13945002853870392, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5880 + }, + { + "epoch": 4.697284345047923, + "grad_norm": 0.16315795481204987, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5881 + }, + { + "epoch": 4.698083067092652, + "grad_norm": 0.0803297907114029, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5882 + }, + { + "epoch": 4.69888178913738, + "grad_norm": 0.09848042577505112, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5883 + }, + { + "epoch": 4.699680511182109, + "grad_norm": 0.22370465099811554, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5884 + }, + { + "epoch": 4.700479233226837, + "grad_norm": 0.09369395673274994, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5885 + }, + { + "epoch": 4.701277955271565, + "grad_norm": 0.42340102791786194, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5886 + }, + { + "epoch": 4.702076677316294, + "grad_norm": 0.08471440523862839, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5887 + }, + { + "epoch": 4.702875399361022, + "grad_norm": 0.11350758373737335, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5888 + }, + { + "epoch": 4.703674121405751, + "grad_norm": 0.16862216591835022, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5889 + }, + { + "epoch": 4.704472843450479, + "grad_norm": 0.17468953132629395, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5890 + }, + { + "epoch": 4.705271565495208, + "grad_norm": 0.09154370427131653, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5891 + }, + { + "epoch": 4.706070287539936, + "grad_norm": 0.08715084940195084, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5892 + }, + { + "epoch": 4.706869009584665, + "grad_norm": 0.06797291338443756, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5893 + }, + { + "epoch": 4.707667731629393, + "grad_norm": 0.17333610355854034, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5894 + }, + { + "epoch": 4.708466453674122, + "grad_norm": 0.17272767424583435, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5895 + }, + { + "epoch": 4.7092651757188495, + "grad_norm": 0.11773357540369034, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5896 + }, + { + "epoch": 4.710063897763578, + "grad_norm": 0.08420758694410324, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5897 + }, + { + "epoch": 4.710862619808307, + "grad_norm": 0.08672801405191422, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5898 + }, + { + "epoch": 4.711661341853035, + "grad_norm": 0.2356635183095932, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5899 + }, + { + "epoch": 4.712460063897764, + "grad_norm": 0.06091082841157913, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5900 + }, + { + "epoch": 4.713258785942492, + "grad_norm": 0.09156842529773712, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5901 + }, + { + "epoch": 4.714057507987221, + "grad_norm": 0.06548108160495758, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5902 + }, + { + "epoch": 4.714856230031949, + "grad_norm": 0.12813016772270203, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5903 + }, + { + "epoch": 4.715654952076678, + "grad_norm": 0.1518833339214325, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5904 + }, + { + "epoch": 4.716453674121405, + "grad_norm": 0.09331580996513367, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5905 + }, + { + "epoch": 4.717252396166134, + "grad_norm": 0.11989843845367432, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5906 + }, + { + "epoch": 4.718051118210862, + "grad_norm": 0.1277054399251938, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5907 + }, + { + "epoch": 4.718849840255591, + "grad_norm": 0.11199159920215607, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5908 + }, + { + "epoch": 4.7196485623003195, + "grad_norm": 0.09120891988277435, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5909 + }, + { + "epoch": 4.720447284345048, + "grad_norm": 0.11668230593204498, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5910 + }, + { + "epoch": 4.7212460063897765, + "grad_norm": 0.08594206720590591, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5911 + }, + { + "epoch": 4.722044728434505, + "grad_norm": 0.11563027650117874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5912 + }, + { + "epoch": 4.722843450479234, + "grad_norm": 0.15066663920879364, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5913 + }, + { + "epoch": 4.723642172523961, + "grad_norm": 0.08566875755786896, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5914 + }, + { + "epoch": 4.72444089456869, + "grad_norm": 0.060813747346401215, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5915 + }, + { + "epoch": 4.725239616613418, + "grad_norm": 0.07391642779111862, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5916 + }, + { + "epoch": 4.726038338658147, + "grad_norm": 0.04867766425013542, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5917 + }, + { + "epoch": 4.726837060702875, + "grad_norm": 0.09468305110931396, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5918 + }, + { + "epoch": 4.727635782747604, + "grad_norm": 0.07287945598363876, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5919 + }, + { + "epoch": 4.728434504792332, + "grad_norm": 0.08984806388616562, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5920 + }, + { + "epoch": 4.729233226837061, + "grad_norm": 0.1755092740058899, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5921 + }, + { + "epoch": 4.7300319488817895, + "grad_norm": 0.09656399488449097, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5922 + }, + { + "epoch": 4.730830670926517, + "grad_norm": 0.15759015083312988, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5923 + }, + { + "epoch": 4.731629392971246, + "grad_norm": 0.13238383829593658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5924 + }, + { + "epoch": 4.732428115015974, + "grad_norm": 0.05352601036429405, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5925 + }, + { + "epoch": 4.733226837060703, + "grad_norm": 0.06253937631845474, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5926 + }, + { + "epoch": 4.734025559105431, + "grad_norm": 0.057317376136779785, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5927 + }, + { + "epoch": 4.73482428115016, + "grad_norm": 0.12154382467269897, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5928 + }, + { + "epoch": 4.735623003194888, + "grad_norm": 0.0547759085893631, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5929 + }, + { + "epoch": 4.736421725239617, + "grad_norm": 0.07446085661649704, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5930 + }, + { + "epoch": 4.737220447284345, + "grad_norm": 0.09809007495641708, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5931 + }, + { + "epoch": 4.738019169329074, + "grad_norm": 0.12434732168912888, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5932 + }, + { + "epoch": 4.738817891373802, + "grad_norm": 0.12192053347826004, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5933 + }, + { + "epoch": 4.73961661341853, + "grad_norm": 0.08006733655929565, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5934 + }, + { + "epoch": 4.7404153354632586, + "grad_norm": 0.14677436649799347, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5935 + }, + { + "epoch": 4.741214057507987, + "grad_norm": 0.10133987665176392, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5936 + }, + { + "epoch": 4.742012779552716, + "grad_norm": 0.10331577062606812, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5937 + }, + { + "epoch": 4.742811501597444, + "grad_norm": 0.14596082270145416, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5938 + }, + { + "epoch": 4.743610223642173, + "grad_norm": 0.15139590203762054, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5939 + }, + { + "epoch": 4.744408945686901, + "grad_norm": 0.0935182124376297, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5940 + }, + { + "epoch": 4.74520766773163, + "grad_norm": 0.1002865880727768, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5941 + }, + { + "epoch": 4.746006389776358, + "grad_norm": 0.0968283861875534, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5942 + }, + { + "epoch": 4.746805111821086, + "grad_norm": 0.11680585891008377, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5943 + }, + { + "epoch": 4.747603833865814, + "grad_norm": 0.12163184583187103, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 5944 + }, + { + "epoch": 4.748402555910543, + "grad_norm": 0.07288502901792526, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5945 + }, + { + "epoch": 4.7492012779552715, + "grad_norm": 0.3335740566253662, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5946 + }, + { + "epoch": 4.75, + "grad_norm": 0.15408654510974884, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5947 + }, + { + "epoch": 4.7507987220447285, + "grad_norm": 0.09612353891134262, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5948 + }, + { + "epoch": 4.751597444089457, + "grad_norm": 0.10403789579868317, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5949 + }, + { + "epoch": 4.752396166134186, + "grad_norm": 0.13026492297649384, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5950 + }, + { + "epoch": 4.753194888178914, + "grad_norm": 0.061955004930496216, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5951 + }, + { + "epoch": 4.753993610223642, + "grad_norm": 0.08264514058828354, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5952 + }, + { + "epoch": 4.75479233226837, + "grad_norm": 0.1132993996143341, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5953 + }, + { + "epoch": 4.755591054313099, + "grad_norm": 0.09022228419780731, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5954 + }, + { + "epoch": 4.756389776357827, + "grad_norm": 0.13192631304264069, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5955 + }, + { + "epoch": 4.757188498402556, + "grad_norm": 0.08400337398052216, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5956 + }, + { + "epoch": 4.757987220447284, + "grad_norm": 0.05070018023252487, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5957 + }, + { + "epoch": 4.758785942492013, + "grad_norm": 0.09561482816934586, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5958 + }, + { + "epoch": 4.7595846645367414, + "grad_norm": 0.07369764894247055, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5959 + }, + { + "epoch": 4.76038338658147, + "grad_norm": 0.07777421176433563, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5960 + }, + { + "epoch": 4.761182108626198, + "grad_norm": 0.11525892466306686, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5961 + }, + { + "epoch": 4.761980830670926, + "grad_norm": 0.1788506656885147, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5962 + }, + { + "epoch": 4.762779552715655, + "grad_norm": 0.10067635029554367, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5963 + }, + { + "epoch": 4.763578274760383, + "grad_norm": 0.08447863161563873, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5964 + }, + { + "epoch": 4.764376996805112, + "grad_norm": 0.06801758706569672, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5965 + }, + { + "epoch": 4.76517571884984, + "grad_norm": 0.07363327592611313, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5966 + }, + { + "epoch": 4.765974440894569, + "grad_norm": 0.05584784597158432, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5967 + }, + { + "epoch": 4.766773162939297, + "grad_norm": 0.10064459592103958, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5968 + }, + { + "epoch": 4.767571884984026, + "grad_norm": 0.1176871508359909, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5969 + }, + { + "epoch": 4.768370607028754, + "grad_norm": 0.17485690116882324, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5970 + }, + { + "epoch": 4.769169329073483, + "grad_norm": 0.15753531455993652, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5971 + }, + { + "epoch": 4.7699680511182105, + "grad_norm": 0.1669864058494568, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5972 + }, + { + "epoch": 4.770766773162939, + "grad_norm": 0.07706131786108017, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5973 + }, + { + "epoch": 4.771565495207668, + "grad_norm": 0.3537883460521698, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5974 + }, + { + "epoch": 4.772364217252396, + "grad_norm": 0.20092372596263885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5975 + }, + { + "epoch": 4.773162939297125, + "grad_norm": 0.06521142274141312, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5976 + }, + { + "epoch": 4.773961661341853, + "grad_norm": 0.1203140988945961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5977 + }, + { + "epoch": 4.774760383386582, + "grad_norm": 0.09655500948429108, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5978 + }, + { + "epoch": 4.77555910543131, + "grad_norm": 0.09220302104949951, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5979 + }, + { + "epoch": 4.776357827476039, + "grad_norm": 0.7336251735687256, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5980 + }, + { + "epoch": 4.777156549520766, + "grad_norm": 0.21415477991104126, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5981 + }, + { + "epoch": 4.777955271565495, + "grad_norm": 0.14869220554828644, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5982 + }, + { + "epoch": 4.7787539936102235, + "grad_norm": 0.0779772400856018, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5983 + }, + { + "epoch": 4.779552715654952, + "grad_norm": 0.14274317026138306, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5984 + }, + { + "epoch": 4.7803514376996805, + "grad_norm": 0.11580413579940796, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5985 + }, + { + "epoch": 4.781150159744409, + "grad_norm": 0.055023401975631714, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5986 + }, + { + "epoch": 4.781948881789138, + "grad_norm": 0.11657343804836273, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5987 + }, + { + "epoch": 4.782747603833866, + "grad_norm": 0.07336080819368362, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5988 + }, + { + "epoch": 4.783546325878595, + "grad_norm": 0.06066504120826721, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5989 + }, + { + "epoch": 4.784345047923322, + "grad_norm": 0.05784285068511963, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5990 + }, + { + "epoch": 4.785143769968051, + "grad_norm": 0.06317969411611557, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5991 + }, + { + "epoch": 4.785942492012779, + "grad_norm": 0.1001245379447937, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5992 + }, + { + "epoch": 4.786741214057508, + "grad_norm": 0.0743420347571373, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5993 + }, + { + "epoch": 4.787539936102236, + "grad_norm": 0.07082799077033997, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5994 + }, + { + "epoch": 4.788338658146965, + "grad_norm": 0.11087984591722488, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5995 + }, + { + "epoch": 4.789137380191693, + "grad_norm": 0.05923386290669441, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5996 + }, + { + "epoch": 4.789936102236422, + "grad_norm": 0.1020246297121048, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5997 + }, + { + "epoch": 4.7907348242811505, + "grad_norm": 0.11524185538291931, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5998 + }, + { + "epoch": 4.791533546325878, + "grad_norm": 0.06959006190299988, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5999 + }, + { + "epoch": 4.792332268370607, + "grad_norm": 0.19179846346378326, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6000 + }, + { + "epoch": 4.793130990415335, + "grad_norm": 0.17232562601566315, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6001 + }, + { + "epoch": 4.793929712460064, + "grad_norm": 0.7047739028930664, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6002 + }, + { + "epoch": 4.794728434504792, + "grad_norm": 0.09086379408836365, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6003 + }, + { + "epoch": 4.795527156549521, + "grad_norm": 0.17785955965518951, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6004 + }, + { + "epoch": 4.796325878594249, + "grad_norm": 0.09529274702072144, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6005 + }, + { + "epoch": 4.797124600638978, + "grad_norm": 0.08041567355394363, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6006 + }, + { + "epoch": 4.797923322683706, + "grad_norm": 0.13888375461101532, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6007 + }, + { + "epoch": 4.798722044728435, + "grad_norm": 0.08110564947128296, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6008 + }, + { + "epoch": 4.799520766773163, + "grad_norm": 0.07443006336688995, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6009 + }, + { + "epoch": 4.800319488817891, + "grad_norm": 0.08499104529619217, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6010 + }, + { + "epoch": 4.80111821086262, + "grad_norm": 0.0616084523499012, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6011 + }, + { + "epoch": 4.801916932907348, + "grad_norm": 0.10845918208360672, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6012 + }, + { + "epoch": 4.802715654952077, + "grad_norm": 0.057658810168504715, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6013 + }, + { + "epoch": 4.803514376996805, + "grad_norm": 0.07163018733263016, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6014 + }, + { + "epoch": 4.804313099041534, + "grad_norm": 0.07016896456480026, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6015 + }, + { + "epoch": 4.805111821086262, + "grad_norm": 0.08233597129583359, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6016 + }, + { + "epoch": 4.805910543130991, + "grad_norm": 0.05408332124352455, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6017 + }, + { + "epoch": 4.806709265175719, + "grad_norm": 0.0886560007929802, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6018 + }, + { + "epoch": 4.807507987220447, + "grad_norm": 0.17860093712806702, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6019 + }, + { + "epoch": 4.8083067092651754, + "grad_norm": 0.26264694333076477, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6020 + }, + { + "epoch": 4.809105431309904, + "grad_norm": 0.08523311465978622, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6021 + }, + { + "epoch": 4.8099041533546325, + "grad_norm": 0.09873831272125244, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6022 + }, + { + "epoch": 4.810702875399361, + "grad_norm": 0.16135412454605103, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6023 + }, + { + "epoch": 4.81150159744409, + "grad_norm": 0.08003875613212585, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6024 + }, + { + "epoch": 4.812300319488818, + "grad_norm": 0.09117014706134796, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6025 + }, + { + "epoch": 4.813099041533547, + "grad_norm": 0.2316243052482605, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6026 + }, + { + "epoch": 4.813897763578275, + "grad_norm": 0.16050362586975098, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6027 + }, + { + "epoch": 4.814696485623003, + "grad_norm": 0.13559919595718384, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6028 + }, + { + "epoch": 4.815495207667731, + "grad_norm": 0.08917123824357986, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6029 + }, + { + "epoch": 4.81629392971246, + "grad_norm": 0.11498702317476273, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6030 + }, + { + "epoch": 4.817092651757188, + "grad_norm": 0.14677700400352478, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6031 + }, + { + "epoch": 4.817891373801917, + "grad_norm": 0.08849102258682251, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6032 + }, + { + "epoch": 4.818690095846645, + "grad_norm": 4.0974507331848145, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6033 + }, + { + "epoch": 4.819488817891374, + "grad_norm": 0.24215161800384521, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6034 + }, + { + "epoch": 4.8202875399361025, + "grad_norm": 0.2679882049560547, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6035 + }, + { + "epoch": 4.821086261980831, + "grad_norm": 0.11113203316926956, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6036 + }, + { + "epoch": 4.821884984025559, + "grad_norm": 0.17725592851638794, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6037 + }, + { + "epoch": 4.822683706070287, + "grad_norm": 0.08446694165468216, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6038 + }, + { + "epoch": 4.823482428115016, + "grad_norm": 0.26757946610450745, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6039 + }, + { + "epoch": 4.824281150159744, + "grad_norm": 0.1900561898946762, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6040 + }, + { + "epoch": 4.825079872204473, + "grad_norm": 0.21993426978588104, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6041 + }, + { + "epoch": 4.825878594249201, + "grad_norm": 15.862943649291992, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6042 + }, + { + "epoch": 4.82667731629393, + "grad_norm": 0.793515145778656, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6043 + }, + { + "epoch": 4.827476038338658, + "grad_norm": 0.5607691407203674, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6044 + }, + { + "epoch": 4.828274760383387, + "grad_norm": 0.2853091359138489, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6045 + }, + { + "epoch": 4.8290734824281145, + "grad_norm": 0.3579944670200348, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6046 + }, + { + "epoch": 4.829872204472844, + "grad_norm": 0.26784929633140564, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6047 + }, + { + "epoch": 4.830670926517572, + "grad_norm": 0.2363428920507431, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6048 + }, + { + "epoch": 4.8314696485623, + "grad_norm": 0.2922425866127014, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6049 + }, + { + "epoch": 4.832268370607029, + "grad_norm": 0.2173125147819519, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6050 + }, + { + "epoch": 4.833067092651757, + "grad_norm": 0.23552696406841278, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6051 + }, + { + "epoch": 4.833865814696486, + "grad_norm": 1.2383053302764893, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6052 + }, + { + "epoch": 4.834664536741214, + "grad_norm": 0.3284873366355896, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6053 + }, + { + "epoch": 4.835463258785943, + "grad_norm": 0.15584628283977509, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6054 + }, + { + "epoch": 4.836261980830671, + "grad_norm": 0.3136327862739563, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6055 + }, + { + "epoch": 4.8370607028754, + "grad_norm": 0.19863441586494446, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6056 + }, + { + "epoch": 4.837859424920127, + "grad_norm": 0.273644357919693, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6057 + }, + { + "epoch": 4.838658146964856, + "grad_norm": 0.2560950517654419, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6058 + }, + { + "epoch": 4.8394568690095845, + "grad_norm": 0.2243220955133438, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6059 + }, + { + "epoch": 4.840255591054313, + "grad_norm": 0.16328522562980652, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6060 + }, + { + "epoch": 4.8410543130990416, + "grad_norm": 0.42267754673957825, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6061 + }, + { + "epoch": 4.84185303514377, + "grad_norm": 0.21733495593070984, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6062 + }, + { + "epoch": 4.842651757188499, + "grad_norm": 0.12917862832546234, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6063 + }, + { + "epoch": 4.843450479233227, + "grad_norm": 0.1829921007156372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6064 + }, + { + "epoch": 4.844249201277956, + "grad_norm": 0.08751819282770157, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6065 + }, + { + "epoch": 4.845047923322683, + "grad_norm": 0.16521455347537994, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6066 + }, + { + "epoch": 4.845846645367412, + "grad_norm": 0.4328543543815613, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6067 + }, + { + "epoch": 4.84664536741214, + "grad_norm": 0.2682073712348938, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6068 + }, + { + "epoch": 4.847444089456869, + "grad_norm": 0.15217293798923492, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6069 + }, + { + "epoch": 4.848242811501597, + "grad_norm": 0.12807190418243408, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 6070 + }, + { + "epoch": 4.849041533546326, + "grad_norm": 1.4503207206726074, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6071 + }, + { + "epoch": 4.8498402555910545, + "grad_norm": 0.5045278668403625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6072 + }, + { + "epoch": 4.850638977635783, + "grad_norm": 0.1992882788181305, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6073 + }, + { + "epoch": 4.8514376996805115, + "grad_norm": 0.3178166151046753, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 6074 + }, + { + "epoch": 4.852236421725239, + "grad_norm": 0.1244354322552681, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6075 + }, + { + "epoch": 4.853035143769968, + "grad_norm": 0.2837885320186615, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 6076 + }, + { + "epoch": 4.853833865814696, + "grad_norm": 0.11910229921340942, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6077 + }, + { + "epoch": 4.854632587859425, + "grad_norm": 0.5774815678596497, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6078 + }, + { + "epoch": 4.855431309904153, + "grad_norm": 0.13028140366077423, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6079 + }, + { + "epoch": 4.856230031948882, + "grad_norm": 0.21022816002368927, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6080 + }, + { + "epoch": 4.85702875399361, + "grad_norm": 0.11758062243461609, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6081 + }, + { + "epoch": 4.857827476038339, + "grad_norm": 0.1321621984243393, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6082 + }, + { + "epoch": 4.858626198083067, + "grad_norm": 0.11481605470180511, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6083 + }, + { + "epoch": 4.859424920127795, + "grad_norm": 0.0976998507976532, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6084 + }, + { + "epoch": 4.8602236421725244, + "grad_norm": 0.7211679220199585, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6085 + }, + { + "epoch": 4.861022364217252, + "grad_norm": 0.1417546272277832, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6086 + }, + { + "epoch": 4.861821086261981, + "grad_norm": 0.13830699026584625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6087 + }, + { + "epoch": 4.862619808306709, + "grad_norm": 0.24840030074119568, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6088 + }, + { + "epoch": 4.863418530351438, + "grad_norm": 3.442054033279419, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6089 + }, + { + "epoch": 4.864217252396166, + "grad_norm": 0.21404840052127838, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 6090 + }, + { + "epoch": 4.865015974440895, + "grad_norm": 0.3657711148262024, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6091 + }, + { + "epoch": 4.865814696485623, + "grad_norm": 0.2189537137746811, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6092 + }, + { + "epoch": 4.866613418530352, + "grad_norm": 0.17866109311580658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6093 + }, + { + "epoch": 4.86741214057508, + "grad_norm": 0.19208978116512299, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6094 + }, + { + "epoch": 4.868210862619808, + "grad_norm": 0.08330709487199783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6095 + }, + { + "epoch": 4.8690095846645365, + "grad_norm": 0.1194678544998169, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6096 + }, + { + "epoch": 4.869808306709265, + "grad_norm": 0.07852908223867416, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6097 + }, + { + "epoch": 4.8706070287539935, + "grad_norm": 0.09230814129114151, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 6098 + }, + { + "epoch": 4.871405750798722, + "grad_norm": 0.06775277107954025, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6099 + }, + { + "epoch": 4.872204472843451, + "grad_norm": 0.28747716546058655, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6100 + }, + { + "epoch": 4.873003194888179, + "grad_norm": 0.11956486105918884, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6101 + }, + { + "epoch": 4.873801916932908, + "grad_norm": 0.09843557327985764, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6102 + }, + { + "epoch": 4.874600638977636, + "grad_norm": 0.08408313244581223, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6103 + }, + { + "epoch": 4.875399361022364, + "grad_norm": 0.08230917155742645, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6104 + }, + { + "epoch": 4.876198083067092, + "grad_norm": 0.08927451819181442, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6105 + }, + { + "epoch": 4.876996805111821, + "grad_norm": 0.5961875319480896, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6106 + }, + { + "epoch": 4.877795527156549, + "grad_norm": 0.5851842164993286, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 6107 + }, + { + "epoch": 4.878594249201278, + "grad_norm": 0.4428717792034149, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6108 + }, + { + "epoch": 4.8793929712460065, + "grad_norm": 3.760467052459717, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 6109 + }, + { + "epoch": 4.880191693290735, + "grad_norm": 84.49950408935547, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 6110 + }, + { + "epoch": 4.8809904153354635, + "grad_norm": 66320516.0, + "learning_rate": 0.0005, + "loss": 1.1423, + "step": 6111 + }, + { + "epoch": 4.881789137380192, + "grad_norm": 676613568.0, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 6112 + }, + { + "epoch": 4.88258785942492, + "grad_norm": 2556641280.0, + "learning_rate": 0.0005, + "loss": 1.2458, + "step": 6113 + }, + { + "epoch": 4.883386581469648, + "grad_norm": 21960.341796875, + "learning_rate": 0.0005, + "loss": 1.3163, + "step": 6114 + }, + { + "epoch": 4.884185303514377, + "grad_norm": 3668.3603515625, + "learning_rate": 0.0005, + "loss": 1.4954, + "step": 6115 + }, + { + "epoch": 4.884984025559105, + "grad_norm": 9.501830101013184, + "learning_rate": 0.0005, + "loss": 2.0388, + "step": 6116 + }, + { + "epoch": 4.885782747603834, + "grad_norm": 1.9570647478103638, + "learning_rate": 0.0005, + "loss": 1.3693, + "step": 6117 + }, + { + "epoch": 4.886581469648562, + "grad_norm": 0.9678036570549011, + "learning_rate": 0.0005, + "loss": 1.2694, + "step": 6118 + }, + { + "epoch": 4.887380191693291, + "grad_norm": 0.7094120383262634, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 6119 + }, + { + "epoch": 4.888178913738019, + "grad_norm": 0.4029041826725006, + "learning_rate": 0.0005, + "loss": 1.1809, + "step": 6120 + }, + { + "epoch": 4.888977635782748, + "grad_norm": 0.8682520389556885, + "learning_rate": 0.0005, + "loss": 1.1689, + "step": 6121 + }, + { + "epoch": 4.8897763578274756, + "grad_norm": 0.5829207301139832, + "learning_rate": 0.0005, + "loss": 1.1921, + "step": 6122 + }, + { + "epoch": 4.890575079872205, + "grad_norm": 0.5038579702377319, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6123 + }, + { + "epoch": 4.891373801916933, + "grad_norm": 0.532597005367279, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6124 + }, + { + "epoch": 4.892172523961661, + "grad_norm": 0.20122192800045013, + "learning_rate": 0.0005, + "loss": 1.1399, + "step": 6125 + }, + { + "epoch": 4.89297124600639, + "grad_norm": 0.22419369220733643, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 6126 + }, + { + "epoch": 4.893769968051118, + "grad_norm": 0.2319759726524353, + "learning_rate": 0.0005, + "loss": 1.13, + "step": 6127 + }, + { + "epoch": 4.894568690095847, + "grad_norm": 0.18733178079128265, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 6128 + }, + { + "epoch": 4.895367412140575, + "grad_norm": 0.35497167706489563, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 6129 + }, + { + "epoch": 4.896166134185304, + "grad_norm": 0.2551584243774414, + "learning_rate": 0.0005, + "loss": 1.1236, + "step": 6130 + }, + { + "epoch": 4.896964856230032, + "grad_norm": 0.337982714176178, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 6131 + }, + { + "epoch": 4.897763578274761, + "grad_norm": 0.2945634722709656, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 6132 + }, + { + "epoch": 4.8985623003194885, + "grad_norm": 0.2571047842502594, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 6133 + }, + { + "epoch": 4.899361022364217, + "grad_norm": 0.23297041654586792, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 6134 + }, + { + "epoch": 4.9001597444089455, + "grad_norm": 0.24131764471530914, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 6135 + }, + { + "epoch": 4.900958466453674, + "grad_norm": 0.22283275425434113, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 6136 + }, + { + "epoch": 4.901757188498403, + "grad_norm": 0.1691826730966568, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 6137 + }, + { + "epoch": 4.902555910543131, + "grad_norm": 0.1532466858625412, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 6138 + }, + { + "epoch": 4.90335463258786, + "grad_norm": 0.14135177433490753, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 6139 + }, + { + "epoch": 4.904153354632588, + "grad_norm": 0.14410537481307983, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 6140 + }, + { + "epoch": 4.904952076677317, + "grad_norm": 0.1097448468208313, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 6141 + }, + { + "epoch": 4.905750798722044, + "grad_norm": 0.0851673111319542, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 6142 + }, + { + "epoch": 4.906549520766773, + "grad_norm": 0.13842107355594635, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 6143 + }, + { + "epoch": 4.907348242811501, + "grad_norm": 0.15126317739486694, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 6144 + }, + { + "epoch": 4.90814696485623, + "grad_norm": 0.13176177442073822, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6145 + }, + { + "epoch": 4.9089456869009584, + "grad_norm": 0.164788156747818, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 6146 + }, + { + "epoch": 4.909744408945687, + "grad_norm": 0.24943718314170837, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6147 + }, + { + "epoch": 4.9105431309904155, + "grad_norm": 0.4325760304927826, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 6148 + }, + { + "epoch": 4.911341853035144, + "grad_norm": 0.5711309313774109, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 6149 + }, + { + "epoch": 4.912140575079873, + "grad_norm": 0.37636998295783997, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 6150 + }, + { + "epoch": 4.9129392971246, + "grad_norm": 0.2788292169570923, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 6151 + }, + { + "epoch": 4.913738019169329, + "grad_norm": 0.31709909439086914, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 6152 + }, + { + "epoch": 4.914536741214057, + "grad_norm": 0.14585916697978973, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6153 + }, + { + "epoch": 4.915335463258786, + "grad_norm": 0.1302923858165741, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 6154 + }, + { + "epoch": 4.916134185303514, + "grad_norm": 0.16156400740146637, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6155 + }, + { + "epoch": 4.916932907348243, + "grad_norm": 0.2323192059993744, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6156 + }, + { + "epoch": 4.917731629392971, + "grad_norm": 0.17504405975341797, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 6157 + }, + { + "epoch": 4.9185303514377, + "grad_norm": 0.07211807370185852, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6158 + }, + { + "epoch": 4.919329073482428, + "grad_norm": 0.26426371932029724, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6159 + }, + { + "epoch": 4.920127795527156, + "grad_norm": 0.237858384847641, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 6160 + }, + { + "epoch": 4.9209265175718855, + "grad_norm": 0.23863473534584045, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6161 + }, + { + "epoch": 4.921725239616613, + "grad_norm": 0.3053814768791199, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6162 + }, + { + "epoch": 4.922523961661342, + "grad_norm": 0.2143447995185852, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6163 + }, + { + "epoch": 4.92332268370607, + "grad_norm": 0.12295633554458618, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 6164 + }, + { + "epoch": 4.924121405750799, + "grad_norm": 0.11128787696361542, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6165 + }, + { + "epoch": 4.924920127795527, + "grad_norm": 0.158652663230896, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 6166 + }, + { + "epoch": 4.925718849840256, + "grad_norm": 0.17612649500370026, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6167 + }, + { + "epoch": 4.926517571884984, + "grad_norm": 0.12243206799030304, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6168 + }, + { + "epoch": 4.927316293929713, + "grad_norm": 0.12234453856945038, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6169 + }, + { + "epoch": 4.928115015974441, + "grad_norm": 0.1968356966972351, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6170 + }, + { + "epoch": 4.928913738019169, + "grad_norm": 0.17286576330661774, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6171 + }, + { + "epoch": 4.9297124600638975, + "grad_norm": 0.0847749337553978, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6172 + }, + { + "epoch": 4.930511182108626, + "grad_norm": 0.0704331174492836, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6173 + }, + { + "epoch": 4.931309904153355, + "grad_norm": 0.12671123445034027, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6174 + }, + { + "epoch": 4.932108626198083, + "grad_norm": 0.10653524100780487, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6175 + }, + { + "epoch": 4.932907348242812, + "grad_norm": 0.0606958381831646, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6176 + }, + { + "epoch": 4.93370607028754, + "grad_norm": 0.12248247116804123, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6177 + }, + { + "epoch": 4.934504792332269, + "grad_norm": 0.1370074301958084, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6178 + }, + { + "epoch": 4.935303514376997, + "grad_norm": 0.05940835922956467, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6179 + }, + { + "epoch": 4.936102236421725, + "grad_norm": 0.1440308690071106, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6180 + }, + { + "epoch": 4.936900958466453, + "grad_norm": 0.1972372829914093, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6181 + }, + { + "epoch": 4.937699680511182, + "grad_norm": 0.10575850307941437, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6182 + }, + { + "epoch": 4.93849840255591, + "grad_norm": 0.11902400851249695, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6183 + }, + { + "epoch": 4.939297124600639, + "grad_norm": 0.15276090800762177, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6184 + }, + { + "epoch": 4.9400958466453675, + "grad_norm": 0.07495112717151642, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6185 + }, + { + "epoch": 4.940894568690096, + "grad_norm": 0.10652542859315872, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6186 + }, + { + "epoch": 4.9416932907348246, + "grad_norm": 0.11347164958715439, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6187 + }, + { + "epoch": 4.942492012779553, + "grad_norm": 0.19946135580539703, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6188 + }, + { + "epoch": 4.943290734824281, + "grad_norm": 0.0771450325846672, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6189 + }, + { + "epoch": 4.944089456869009, + "grad_norm": 0.1086430475115776, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6190 + }, + { + "epoch": 4.944888178913738, + "grad_norm": 0.08790839463472366, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6191 + }, + { + "epoch": 4.945686900958466, + "grad_norm": 0.22063800692558289, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6192 + }, + { + "epoch": 4.946485623003195, + "grad_norm": 0.22287815809249878, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6193 + }, + { + "epoch": 4.947284345047923, + "grad_norm": 1.695265769958496, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6194 + }, + { + "epoch": 4.948083067092652, + "grad_norm": 0.6316840052604675, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 6195 + }, + { + "epoch": 4.94888178913738, + "grad_norm": 0.35637202858924866, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 6196 + }, + { + "epoch": 4.949680511182109, + "grad_norm": 0.2844616174697876, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 6197 + }, + { + "epoch": 4.950479233226837, + "grad_norm": 0.19614022970199585, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 6198 + }, + { + "epoch": 4.951277955271565, + "grad_norm": 0.3665562868118286, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 6199 + }, + { + "epoch": 4.952076677316294, + "grad_norm": 0.1485169231891632, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 6200 + }, + { + "epoch": 4.952875399361022, + "grad_norm": 0.19647273421287537, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6201 + }, + { + "epoch": 4.953674121405751, + "grad_norm": 0.19809085130691528, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6202 + }, + { + "epoch": 4.954472843450479, + "grad_norm": 0.1129874736070633, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6203 + }, + { + "epoch": 4.955271565495208, + "grad_norm": 0.2082832157611847, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6204 + }, + { + "epoch": 4.956070287539936, + "grad_norm": 0.20414425432682037, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6205 + }, + { + "epoch": 4.956869009584665, + "grad_norm": 0.16667422652244568, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6206 + }, + { + "epoch": 4.957667731629393, + "grad_norm": 0.25111839175224304, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 6207 + }, + { + "epoch": 4.958466453674122, + "grad_norm": 0.16995272040367126, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 6208 + }, + { + "epoch": 4.9592651757188495, + "grad_norm": 0.10725044459104538, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6209 + }, + { + "epoch": 4.960063897763578, + "grad_norm": 0.17728300392627716, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6210 + }, + { + "epoch": 4.960862619808307, + "grad_norm": 0.1334110051393509, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6211 + }, + { + "epoch": 4.961661341853035, + "grad_norm": 0.14835794270038605, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6212 + }, + { + "epoch": 4.962460063897764, + "grad_norm": 0.14602027833461761, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6213 + }, + { + "epoch": 4.963258785942492, + "grad_norm": 0.162953719496727, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6214 + }, + { + "epoch": 4.964057507987221, + "grad_norm": 0.7214393615722656, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6215 + }, + { + "epoch": 4.964856230031949, + "grad_norm": 0.27030259370803833, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6216 + }, + { + "epoch": 4.965654952076678, + "grad_norm": 0.18558967113494873, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 6217 + }, + { + "epoch": 4.966453674121405, + "grad_norm": 0.09276804327964783, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6218 + }, + { + "epoch": 4.967252396166134, + "grad_norm": 0.11957832425832748, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6219 + }, + { + "epoch": 4.968051118210862, + "grad_norm": 0.8338447213172913, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 6220 + }, + { + "epoch": 4.968849840255591, + "grad_norm": 0.7283904552459717, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 6221 + }, + { + "epoch": 4.9696485623003195, + "grad_norm": 0.07938430458307266, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6222 + }, + { + "epoch": 4.970447284345048, + "grad_norm": 0.15368770062923431, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6223 + }, + { + "epoch": 4.9712460063897765, + "grad_norm": 0.08823438733816147, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6224 + }, + { + "epoch": 4.972044728434505, + "grad_norm": 0.07656054943799973, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6225 + }, + { + "epoch": 4.972843450479234, + "grad_norm": 0.08777901530265808, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6226 + }, + { + "epoch": 4.973642172523961, + "grad_norm": 0.09863653033971786, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6227 + }, + { + "epoch": 4.97444089456869, + "grad_norm": 0.13259904086589813, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6228 + }, + { + "epoch": 4.975239616613418, + "grad_norm": 0.08148759603500366, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6229 + }, + { + "epoch": 4.976038338658147, + "grad_norm": 0.06982999294996262, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6230 + }, + { + "epoch": 4.976837060702875, + "grad_norm": 0.09279565513134003, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6231 + }, + { + "epoch": 4.977635782747604, + "grad_norm": 0.05821947008371353, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6232 + }, + { + "epoch": 4.978434504792332, + "grad_norm": 0.07475738972425461, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6233 + }, + { + "epoch": 4.979233226837061, + "grad_norm": 0.10464147478342056, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6234 + }, + { + "epoch": 4.9800319488817895, + "grad_norm": 0.08045687526464462, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6235 + }, + { + "epoch": 4.980830670926517, + "grad_norm": 0.08045300841331482, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6236 + }, + { + "epoch": 4.981629392971246, + "grad_norm": 0.10313838720321655, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6237 + }, + { + "epoch": 4.982428115015974, + "grad_norm": 0.08065208047628403, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6238 + }, + { + "epoch": 4.983226837060703, + "grad_norm": 0.0807032585144043, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6239 + }, + { + "epoch": 4.984025559105431, + "grad_norm": 0.06274307519197464, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6240 + }, + { + "epoch": 4.98482428115016, + "grad_norm": 0.07299554347991943, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6241 + }, + { + "epoch": 4.985623003194888, + "grad_norm": 0.0592481754720211, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6242 + }, + { + "epoch": 4.986421725239617, + "grad_norm": 0.0766056478023529, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6243 + }, + { + "epoch": 4.987220447284345, + "grad_norm": 0.07707066088914871, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6244 + }, + { + "epoch": 4.988019169329074, + "grad_norm": 0.7231665849685669, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6245 + }, + { + "epoch": 4.988817891373802, + "grad_norm": 0.0678652748465538, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6246 + }, + { + "epoch": 4.98961661341853, + "grad_norm": 3.667872905731201, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6247 + }, + { + "epoch": 4.9904153354632586, + "grad_norm": 0.2416938990354538, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6248 + }, + { + "epoch": 4.991214057507987, + "grad_norm": 0.27054834365844727, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6249 + }, + { + "epoch": 4.992012779552716, + "grad_norm": 0.1435888707637787, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6250 + }, + { + "epoch": 4.992811501597444, + "grad_norm": 0.1542683094739914, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6251 + }, + { + "epoch": 4.993610223642173, + "grad_norm": 0.1867702603340149, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6252 + }, + { + "epoch": 4.994408945686901, + "grad_norm": 0.09558507800102234, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6253 + }, + { + "epoch": 4.99520766773163, + "grad_norm": 0.3019699156284332, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6254 + }, + { + "epoch": 4.996006389776358, + "grad_norm": 0.11987117677927017, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6255 + }, + { + "epoch": 4.996805111821086, + "grad_norm": 0.11792664974927902, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6256 + }, + { + "epoch": 4.997603833865814, + "grad_norm": 0.15580247342586517, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6257 + }, + { + "epoch": 4.998402555910543, + "grad_norm": 0.20167642831802368, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6258 + }, + { + "epoch": 4.9992012779552715, + "grad_norm": 0.11203871667385101, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6259 + }, + { + "epoch": 5.0, + "grad_norm": 0.11081275343894958, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6260 + }, + { + "epoch": 5.0007987220447285, + "grad_norm": 0.11213719099760056, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6261 + }, + { + "epoch": 5.001597444089457, + "grad_norm": 0.11074960231781006, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6262 + }, + { + "epoch": 5.002396166134186, + "grad_norm": 0.07538039237260818, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6263 + }, + { + "epoch": 5.003194888178914, + "grad_norm": 0.0824185386300087, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6264 + }, + { + "epoch": 5.003993610223642, + "grad_norm": 0.08940225094556808, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6265 + }, + { + "epoch": 5.00479233226837, + "grad_norm": 0.07072590291500092, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6266 + }, + { + "epoch": 5.005591054313099, + "grad_norm": 0.13027220964431763, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6267 + }, + { + "epoch": 5.006389776357827, + "grad_norm": 0.09226793050765991, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6268 + }, + { + "epoch": 5.007188498402556, + "grad_norm": 0.1879013329744339, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6269 + }, + { + "epoch": 5.007987220447284, + "grad_norm": 0.09063144028186798, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6270 + }, + { + "epoch": 5.008785942492013, + "grad_norm": 0.09013621509075165, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6271 + }, + { + "epoch": 5.0095846645367414, + "grad_norm": 0.2404542863368988, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6272 + }, + { + "epoch": 5.01038338658147, + "grad_norm": 0.11968059092760086, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6273 + }, + { + "epoch": 5.0111821086261985, + "grad_norm": 0.16429072618484497, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6274 + }, + { + "epoch": 5.011980830670926, + "grad_norm": 0.08745420724153519, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6275 + }, + { + "epoch": 5.012779552715655, + "grad_norm": 0.09130390733480453, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6276 + }, + { + "epoch": 5.013578274760383, + "grad_norm": 0.06996344774961472, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6277 + }, + { + "epoch": 5.014376996805112, + "grad_norm": 0.06063826382160187, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6278 + }, + { + "epoch": 5.01517571884984, + "grad_norm": 0.14752542972564697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6279 + }, + { + "epoch": 5.015974440894569, + "grad_norm": 0.05987429618835449, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6280 + }, + { + "epoch": 5.016773162939297, + "grad_norm": 0.1716211587190628, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6281 + }, + { + "epoch": 5.017571884984026, + "grad_norm": 0.13823190331459045, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6282 + }, + { + "epoch": 5.018370607028754, + "grad_norm": 0.09764201194047928, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6283 + }, + { + "epoch": 5.019169329073482, + "grad_norm": 0.07897874712944031, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6284 + }, + { + "epoch": 5.0199680511182105, + "grad_norm": 0.07823392748832703, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6285 + }, + { + "epoch": 5.020766773162939, + "grad_norm": 0.1033136323094368, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6286 + }, + { + "epoch": 5.021565495207668, + "grad_norm": 0.07100827991962433, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6287 + }, + { + "epoch": 5.022364217252396, + "grad_norm": 0.40211987495422363, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 6288 + }, + { + "epoch": 5.023162939297125, + "grad_norm": 0.15459896624088287, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6289 + }, + { + "epoch": 5.023961661341853, + "grad_norm": 0.07789050787687302, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6290 + }, + { + "epoch": 5.024760383386582, + "grad_norm": 0.2116134762763977, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6291 + }, + { + "epoch": 5.02555910543131, + "grad_norm": 0.1842123568058014, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6292 + }, + { + "epoch": 5.026357827476039, + "grad_norm": 0.2037680447101593, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6293 + }, + { + "epoch": 5.027156549520766, + "grad_norm": 0.10851238667964935, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6294 + }, + { + "epoch": 5.027955271565495, + "grad_norm": 0.14465196430683136, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6295 + }, + { + "epoch": 5.0287539936102235, + "grad_norm": 0.11993128806352615, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6296 + }, + { + "epoch": 5.029552715654952, + "grad_norm": 0.13647349178791046, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6297 + }, + { + "epoch": 5.0303514376996805, + "grad_norm": 0.11265698075294495, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 6298 + }, + { + "epoch": 5.031150159744409, + "grad_norm": 18.601808547973633, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6299 + }, + { + "epoch": 5.031948881789138, + "grad_norm": 0.40079689025878906, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6300 + }, + { + "epoch": 5.032747603833866, + "grad_norm": 3.513967752456665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6301 + }, + { + "epoch": 5.033546325878595, + "grad_norm": 24.040191650390625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6302 + }, + { + "epoch": 5.034345047923322, + "grad_norm": 0.7786405086517334, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6303 + }, + { + "epoch": 5.035143769968051, + "grad_norm": 0.619868814945221, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6304 + }, + { + "epoch": 5.035942492012779, + "grad_norm": 6.039219379425049, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6305 + }, + { + "epoch": 5.036741214057508, + "grad_norm": 23.90920639038086, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 6306 + }, + { + "epoch": 5.037539936102236, + "grad_norm": 1.296809196472168, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 6307 + }, + { + "epoch": 5.038338658146965, + "grad_norm": 0.7673514485359192, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 6308 + }, + { + "epoch": 5.039137380191693, + "grad_norm": 0.5065979957580566, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 6309 + }, + { + "epoch": 5.039936102236422, + "grad_norm": 0.3858639597892761, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 6310 + }, + { + "epoch": 5.0407348242811505, + "grad_norm": 0.2647075653076172, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 6311 + }, + { + "epoch": 5.041533546325879, + "grad_norm": 0.2713094651699066, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 6312 + }, + { + "epoch": 5.042332268370607, + "grad_norm": 0.2573802173137665, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 6313 + }, + { + "epoch": 5.043130990415335, + "grad_norm": 0.2083175778388977, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 6314 + }, + { + "epoch": 5.043929712460064, + "grad_norm": 0.3625626564025879, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 6315 + }, + { + "epoch": 5.044728434504792, + "grad_norm": 0.331129789352417, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 6316 + }, + { + "epoch": 5.045527156549521, + "grad_norm": 0.23352555930614471, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 6317 + }, + { + "epoch": 5.046325878594249, + "grad_norm": 0.24043256044387817, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 6318 + }, + { + "epoch": 5.047124600638978, + "grad_norm": 0.31510207056999207, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 6319 + }, + { + "epoch": 5.047923322683706, + "grad_norm": 0.6896952390670776, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 6320 + }, + { + "epoch": 5.048722044728435, + "grad_norm": 0.7915457487106323, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 6321 + }, + { + "epoch": 5.0495207667731625, + "grad_norm": 0.2959117889404297, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 6322 + }, + { + "epoch": 5.050319488817891, + "grad_norm": 0.44844529032707214, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 6323 + }, + { + "epoch": 5.05111821086262, + "grad_norm": 0.3385697305202484, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 6324 + }, + { + "epoch": 5.051916932907348, + "grad_norm": 0.31220802664756775, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 6325 + }, + { + "epoch": 5.052715654952077, + "grad_norm": 0.3420731723308563, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 6326 + }, + { + "epoch": 5.053514376996805, + "grad_norm": 0.3061322569847107, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 6327 + }, + { + "epoch": 5.054313099041534, + "grad_norm": 0.6878030300140381, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 6328 + }, + { + "epoch": 5.055111821086262, + "grad_norm": 0.1927136927843094, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 6329 + }, + { + "epoch": 5.055910543130991, + "grad_norm": 0.24812163412570953, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6330 + }, + { + "epoch": 5.056709265175719, + "grad_norm": 0.19675321877002716, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6331 + }, + { + "epoch": 5.057507987220447, + "grad_norm": 0.20720984041690826, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6332 + }, + { + "epoch": 5.0583067092651754, + "grad_norm": 0.1260477900505066, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6333 + }, + { + "epoch": 5.059105431309904, + "grad_norm": 0.24399158358573914, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6334 + }, + { + "epoch": 5.0599041533546325, + "grad_norm": 0.22406993806362152, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6335 + }, + { + "epoch": 5.060702875399361, + "grad_norm": 0.24807684123516083, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6336 + }, + { + "epoch": 5.06150159744409, + "grad_norm": 0.1272616684436798, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6337 + }, + { + "epoch": 5.062300319488818, + "grad_norm": 0.2053418755531311, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6338 + }, + { + "epoch": 5.063099041533547, + "grad_norm": 0.13628287613391876, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6339 + }, + { + "epoch": 5.063897763578275, + "grad_norm": 0.21262522041797638, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6340 + }, + { + "epoch": 5.064696485623003, + "grad_norm": 0.3784351646900177, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6341 + }, + { + "epoch": 5.065495207667731, + "grad_norm": 0.3282131552696228, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6342 + }, + { + "epoch": 5.06629392971246, + "grad_norm": 0.10128312557935715, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6343 + }, + { + "epoch": 5.067092651757188, + "grad_norm": 0.2297000139951706, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6344 + }, + { + "epoch": 5.067891373801917, + "grad_norm": 0.11327458173036575, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6345 + }, + { + "epoch": 5.068690095846645, + "grad_norm": 0.16150346398353577, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6346 + }, + { + "epoch": 5.069488817891374, + "grad_norm": 0.15486986935138702, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6347 + }, + { + "epoch": 5.0702875399361025, + "grad_norm": 0.12427826225757599, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6348 + }, + { + "epoch": 5.071086261980831, + "grad_norm": 0.11321424692869186, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6349 + }, + { + "epoch": 5.0718849840255595, + "grad_norm": 0.12668851017951965, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6350 + }, + { + "epoch": 5.072683706070287, + "grad_norm": 0.20059579610824585, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6351 + }, + { + "epoch": 5.073482428115016, + "grad_norm": 0.14591605961322784, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6352 + }, + { + "epoch": 5.074281150159744, + "grad_norm": 0.19168664515018463, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6353 + }, + { + "epoch": 5.075079872204473, + "grad_norm": 0.19381079077720642, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6354 + }, + { + "epoch": 5.075878594249201, + "grad_norm": 0.0957496389746666, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6355 + }, + { + "epoch": 5.07667731629393, + "grad_norm": 0.11414145678281784, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6356 + }, + { + "epoch": 5.077476038338658, + "grad_norm": 0.10855124145746231, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6357 + }, + { + "epoch": 5.078274760383387, + "grad_norm": 0.2300068736076355, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6358 + }, + { + "epoch": 5.079073482428115, + "grad_norm": 0.15098270773887634, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 6359 + }, + { + "epoch": 5.079872204472843, + "grad_norm": 0.09821227937936783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6360 + }, + { + "epoch": 5.080670926517572, + "grad_norm": 0.135583758354187, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6361 + }, + { + "epoch": 5.0814696485623, + "grad_norm": 0.07262608408927917, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6362 + }, + { + "epoch": 5.082268370607029, + "grad_norm": 0.10731761902570724, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6363 + }, + { + "epoch": 5.083067092651757, + "grad_norm": 0.27508556842803955, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6364 + }, + { + "epoch": 5.083865814696486, + "grad_norm": 0.12996995449066162, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6365 + }, + { + "epoch": 5.084664536741214, + "grad_norm": 0.10386788845062256, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6366 + }, + { + "epoch": 5.085463258785943, + "grad_norm": 0.07591816782951355, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6367 + }, + { + "epoch": 5.086261980830671, + "grad_norm": 0.09341761469841003, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6368 + }, + { + "epoch": 5.0870607028754, + "grad_norm": 0.12575088441371918, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6369 + }, + { + "epoch": 5.087859424920127, + "grad_norm": 0.3423956036567688, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 6370 + }, + { + "epoch": 5.088658146964856, + "grad_norm": 0.2154775857925415, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6371 + }, + { + "epoch": 5.0894568690095845, + "grad_norm": 0.1550479382276535, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6372 + }, + { + "epoch": 5.090255591054313, + "grad_norm": 0.08802525699138641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6373 + }, + { + "epoch": 5.0910543130990416, + "grad_norm": 0.08421735465526581, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6374 + }, + { + "epoch": 5.09185303514377, + "grad_norm": 0.08920808881521225, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6375 + }, + { + "epoch": 5.092651757188499, + "grad_norm": 0.1450507938861847, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6376 + }, + { + "epoch": 5.093450479233227, + "grad_norm": 0.16926947236061096, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 6377 + }, + { + "epoch": 5.094249201277956, + "grad_norm": 0.6995428204536438, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6378 + }, + { + "epoch": 5.095047923322683, + "grad_norm": 0.10353969782590866, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6379 + }, + { + "epoch": 5.095846645367412, + "grad_norm": 0.09132180362939835, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6380 + }, + { + "epoch": 5.09664536741214, + "grad_norm": 0.17745476961135864, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6381 + }, + { + "epoch": 5.097444089456869, + "grad_norm": 0.10596930980682373, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6382 + }, + { + "epoch": 5.098242811501597, + "grad_norm": 0.11676348745822906, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6383 + }, + { + "epoch": 5.099041533546326, + "grad_norm": 0.13022664189338684, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6384 + }, + { + "epoch": 5.0998402555910545, + "grad_norm": 0.11169753223657608, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6385 + }, + { + "epoch": 5.100638977635783, + "grad_norm": 0.07439867407083511, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6386 + }, + { + "epoch": 5.1014376996805115, + "grad_norm": 0.06953777372837067, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6387 + }, + { + "epoch": 5.102236421725239, + "grad_norm": 0.09419669955968857, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6388 + }, + { + "epoch": 5.103035143769968, + "grad_norm": 0.1166587546467781, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6389 + }, + { + "epoch": 5.103833865814696, + "grad_norm": 0.5776185393333435, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6390 + }, + { + "epoch": 5.104632587859425, + "grad_norm": 0.13175810873508453, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6391 + }, + { + "epoch": 5.105431309904153, + "grad_norm": 0.09372890740633011, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6392 + }, + { + "epoch": 5.106230031948882, + "grad_norm": 0.25262513756752014, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6393 + }, + { + "epoch": 5.10702875399361, + "grad_norm": 0.1348644196987152, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6394 + }, + { + "epoch": 5.107827476038339, + "grad_norm": 0.23879335820674896, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6395 + }, + { + "epoch": 5.108626198083067, + "grad_norm": 0.25561729073524475, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6396 + }, + { + "epoch": 5.109424920127796, + "grad_norm": 0.26974916458129883, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6397 + }, + { + "epoch": 5.110223642172524, + "grad_norm": 0.1866329163312912, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6398 + }, + { + "epoch": 5.111022364217252, + "grad_norm": 0.22104737162590027, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6399 + }, + { + "epoch": 5.111821086261981, + "grad_norm": 0.3775753676891327, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6400 + }, + { + "epoch": 5.112619808306709, + "grad_norm": 0.20636002719402313, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6401 + }, + { + "epoch": 5.113418530351438, + "grad_norm": 0.1941772699356079, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6402 + }, + { + "epoch": 5.114217252396166, + "grad_norm": 0.14595480263233185, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6403 + }, + { + "epoch": 5.115015974440895, + "grad_norm": 0.16794493794441223, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6404 + }, + { + "epoch": 5.115814696485623, + "grad_norm": 0.16466112434864044, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6405 + }, + { + "epoch": 5.116613418530352, + "grad_norm": 0.27192312479019165, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6406 + }, + { + "epoch": 5.11741214057508, + "grad_norm": 0.296017050743103, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6407 + }, + { + "epoch": 5.118210862619808, + "grad_norm": 0.24947655200958252, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6408 + }, + { + "epoch": 5.1190095846645365, + "grad_norm": 0.07843278348445892, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6409 + }, + { + "epoch": 5.119808306709265, + "grad_norm": 0.2507891356945038, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6410 + }, + { + "epoch": 5.1206070287539935, + "grad_norm": 0.2962022125720978, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6411 + }, + { + "epoch": 5.121405750798722, + "grad_norm": 0.21588601171970367, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6412 + }, + { + "epoch": 5.122204472843451, + "grad_norm": 0.27223092317581177, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6413 + }, + { + "epoch": 5.123003194888179, + "grad_norm": 0.1475650519132614, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6414 + }, + { + "epoch": 5.123801916932908, + "grad_norm": 0.2624805271625519, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6415 + }, + { + "epoch": 5.124600638977636, + "grad_norm": 0.27691081166267395, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6416 + }, + { + "epoch": 5.125399361022364, + "grad_norm": 0.1828494369983673, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6417 + }, + { + "epoch": 5.126198083067092, + "grad_norm": 0.27542614936828613, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 6418 + }, + { + "epoch": 5.126996805111821, + "grad_norm": 0.16250371932983398, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6419 + }, + { + "epoch": 5.127795527156549, + "grad_norm": 0.17180733382701874, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6420 + }, + { + "epoch": 5.128594249201278, + "grad_norm": 0.21466004848480225, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6421 + }, + { + "epoch": 5.1293929712460065, + "grad_norm": 0.13144539296627045, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6422 + }, + { + "epoch": 5.130191693290735, + "grad_norm": 0.158688023686409, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6423 + }, + { + "epoch": 5.1309904153354635, + "grad_norm": 0.1430175006389618, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6424 + }, + { + "epoch": 5.131789137380192, + "grad_norm": 0.0988554134964943, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6425 + }, + { + "epoch": 5.13258785942492, + "grad_norm": 0.18320757150650024, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6426 + }, + { + "epoch": 5.133386581469648, + "grad_norm": 0.34172165393829346, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6427 + }, + { + "epoch": 5.134185303514377, + "grad_norm": 0.095450758934021, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6428 + }, + { + "epoch": 5.134984025559105, + "grad_norm": 0.2988479733467102, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6429 + }, + { + "epoch": 5.135782747603834, + "grad_norm": 0.11462085694074631, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6430 + }, + { + "epoch": 5.136581469648562, + "grad_norm": 0.11989153176546097, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6431 + }, + { + "epoch": 5.137380191693291, + "grad_norm": 0.15308552980422974, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6432 + }, + { + "epoch": 5.138178913738019, + "grad_norm": 0.1119944304227829, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6433 + }, + { + "epoch": 5.138977635782748, + "grad_norm": 0.38812172412872314, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6434 + }, + { + "epoch": 5.139776357827476, + "grad_norm": 0.24718649685382843, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6435 + }, + { + "epoch": 5.140575079872204, + "grad_norm": 0.15834778547286987, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6436 + }, + { + "epoch": 5.141373801916933, + "grad_norm": 0.1960451751947403, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6437 + }, + { + "epoch": 5.142172523961661, + "grad_norm": 0.16195416450500488, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6438 + }, + { + "epoch": 5.14297124600639, + "grad_norm": 0.07554367184638977, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6439 + }, + { + "epoch": 5.143769968051118, + "grad_norm": 0.18924687802791595, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6440 + }, + { + "epoch": 5.144568690095847, + "grad_norm": 0.16253480315208435, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6441 + }, + { + "epoch": 5.145367412140575, + "grad_norm": 0.12711918354034424, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6442 + }, + { + "epoch": 5.146166134185304, + "grad_norm": 0.16831086575984955, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6443 + }, + { + "epoch": 5.146964856230032, + "grad_norm": 0.35199087858200073, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6444 + }, + { + "epoch": 5.147763578274761, + "grad_norm": 0.1340232491493225, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6445 + }, + { + "epoch": 5.1485623003194885, + "grad_norm": 0.1397274285554886, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6446 + }, + { + "epoch": 5.149361022364217, + "grad_norm": 0.13868366181850433, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6447 + }, + { + "epoch": 5.1501597444089455, + "grad_norm": 0.08846192806959152, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6448 + }, + { + "epoch": 5.150958466453674, + "grad_norm": 0.08350610733032227, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6449 + }, + { + "epoch": 5.151757188498403, + "grad_norm": 0.14727875590324402, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6450 + }, + { + "epoch": 5.152555910543131, + "grad_norm": 0.11705708503723145, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6451 + }, + { + "epoch": 5.15335463258786, + "grad_norm": 0.10308192670345306, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6452 + }, + { + "epoch": 5.154153354632588, + "grad_norm": 0.09459209442138672, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6453 + }, + { + "epoch": 5.154952076677317, + "grad_norm": 0.11605191230773926, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6454 + }, + { + "epoch": 5.155750798722044, + "grad_norm": 0.24275821447372437, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6455 + }, + { + "epoch": 5.156549520766773, + "grad_norm": 0.208640456199646, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6456 + }, + { + "epoch": 5.157348242811501, + "grad_norm": 0.15257662534713745, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6457 + }, + { + "epoch": 5.15814696485623, + "grad_norm": 0.10431355237960815, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6458 + }, + { + "epoch": 5.1589456869009584, + "grad_norm": 0.14187589287757874, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6459 + }, + { + "epoch": 5.159744408945687, + "grad_norm": 0.19084404408931732, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6460 + }, + { + "epoch": 5.1605431309904155, + "grad_norm": 0.09255128353834152, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6461 + }, + { + "epoch": 5.161341853035144, + "grad_norm": 0.1443471759557724, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6462 + }, + { + "epoch": 5.162140575079873, + "grad_norm": 0.36597245931625366, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6463 + }, + { + "epoch": 5.1629392971246, + "grad_norm": 0.3835389316082001, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6464 + }, + { + "epoch": 5.163738019169329, + "grad_norm": 0.14208771288394928, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6465 + }, + { + "epoch": 5.164536741214057, + "grad_norm": 0.2520706355571747, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6466 + }, + { + "epoch": 5.165335463258786, + "grad_norm": 0.2595224976539612, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6467 + }, + { + "epoch": 5.166134185303514, + "grad_norm": 0.15721063315868378, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6468 + }, + { + "epoch": 5.166932907348243, + "grad_norm": 0.1772007793188095, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6469 + }, + { + "epoch": 5.167731629392971, + "grad_norm": 0.19899888336658478, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6470 + }, + { + "epoch": 5.1685303514377, + "grad_norm": 0.18689346313476562, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6471 + }, + { + "epoch": 5.169329073482428, + "grad_norm": 0.16748468577861786, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6472 + }, + { + "epoch": 5.170127795527157, + "grad_norm": 0.13296879827976227, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6473 + }, + { + "epoch": 5.170926517571885, + "grad_norm": 0.18742166459560394, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6474 + }, + { + "epoch": 5.171725239616613, + "grad_norm": 0.17811308801174164, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6475 + }, + { + "epoch": 5.172523961661342, + "grad_norm": 0.1360485702753067, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 6476 + }, + { + "epoch": 5.17332268370607, + "grad_norm": 0.13431121408939362, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6477 + }, + { + "epoch": 5.174121405750799, + "grad_norm": 0.12888069450855255, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6478 + }, + { + "epoch": 5.174920127795527, + "grad_norm": 0.15194712579250336, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6479 + }, + { + "epoch": 5.175718849840256, + "grad_norm": 0.13076889514923096, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6480 + }, + { + "epoch": 5.176517571884984, + "grad_norm": 0.14751110970973969, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6481 + }, + { + "epoch": 5.177316293929713, + "grad_norm": 0.11919333785772324, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6482 + }, + { + "epoch": 5.178115015974441, + "grad_norm": 0.12712688744068146, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6483 + }, + { + "epoch": 5.178913738019169, + "grad_norm": 0.13765369355678558, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 6484 + }, + { + "epoch": 5.1797124600638975, + "grad_norm": 0.11060373485088348, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6485 + }, + { + "epoch": 5.180511182108626, + "grad_norm": 0.056882213801145554, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6486 + }, + { + "epoch": 5.181309904153355, + "grad_norm": 0.11317770928144455, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6487 + }, + { + "epoch": 5.182108626198083, + "grad_norm": 0.09279809147119522, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6488 + }, + { + "epoch": 5.182907348242812, + "grad_norm": 0.09392786771059036, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6489 + }, + { + "epoch": 5.18370607028754, + "grad_norm": 0.13042815029621124, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6490 + }, + { + "epoch": 5.184504792332269, + "grad_norm": 0.07929978519678116, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6491 + }, + { + "epoch": 5.185303514376997, + "grad_norm": 0.12215851992368698, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6492 + }, + { + "epoch": 5.186102236421725, + "grad_norm": 0.12000773102045059, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6493 + }, + { + "epoch": 5.186900958466453, + "grad_norm": 0.08427707850933075, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6494 + }, + { + "epoch": 5.187699680511182, + "grad_norm": 0.158653125166893, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6495 + }, + { + "epoch": 5.18849840255591, + "grad_norm": 0.11087878793478012, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6496 + }, + { + "epoch": 5.189297124600639, + "grad_norm": 0.12649668753147125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6497 + }, + { + "epoch": 5.1900958466453675, + "grad_norm": 0.0821281224489212, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6498 + }, + { + "epoch": 5.190894568690096, + "grad_norm": 0.07192671298980713, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6499 + }, + { + "epoch": 5.1916932907348246, + "grad_norm": 0.10505214333534241, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6500 + }, + { + "epoch": 5.192492012779553, + "grad_norm": 0.11772353947162628, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6501 + }, + { + "epoch": 5.193290734824281, + "grad_norm": 0.15557901561260223, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6502 + }, + { + "epoch": 5.194089456869009, + "grad_norm": 0.09753020852804184, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6503 + }, + { + "epoch": 5.194888178913738, + "grad_norm": 0.10331830382347107, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6504 + }, + { + "epoch": 5.195686900958466, + "grad_norm": 0.130085289478302, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6505 + }, + { + "epoch": 5.196485623003195, + "grad_norm": 0.08772018551826477, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6506 + }, + { + "epoch": 5.197284345047923, + "grad_norm": 0.1906667798757553, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6507 + }, + { + "epoch": 5.198083067092652, + "grad_norm": 0.06724394112825394, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6508 + }, + { + "epoch": 5.19888178913738, + "grad_norm": 0.1141325905919075, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6509 + }, + { + "epoch": 5.199680511182109, + "grad_norm": 0.08354665338993073, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6510 + }, + { + "epoch": 5.2004792332268375, + "grad_norm": 0.1072440817952156, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6511 + }, + { + "epoch": 5.201277955271565, + "grad_norm": 0.10670839250087738, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6512 + }, + { + "epoch": 5.202076677316294, + "grad_norm": 0.10079781711101532, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6513 + }, + { + "epoch": 5.202875399361022, + "grad_norm": 0.1281125396490097, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6514 + }, + { + "epoch": 5.203674121405751, + "grad_norm": 0.1627720147371292, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6515 + }, + { + "epoch": 5.204472843450479, + "grad_norm": 0.1507575958967209, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6516 + }, + { + "epoch": 5.205271565495208, + "grad_norm": 0.17764779925346375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6517 + }, + { + "epoch": 5.206070287539936, + "grad_norm": 0.1825307011604309, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6518 + }, + { + "epoch": 5.206869009584665, + "grad_norm": 0.1151907742023468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6519 + }, + { + "epoch": 5.207667731629393, + "grad_norm": 0.1425708830356598, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6520 + }, + { + "epoch": 5.208466453674121, + "grad_norm": 0.08555550873279572, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6521 + }, + { + "epoch": 5.2092651757188495, + "grad_norm": 0.15400084853172302, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6522 + }, + { + "epoch": 5.210063897763578, + "grad_norm": 0.11088921129703522, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6523 + }, + { + "epoch": 5.210862619808307, + "grad_norm": 0.0959518551826477, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6524 + }, + { + "epoch": 5.211661341853035, + "grad_norm": 0.1054866686463356, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6525 + }, + { + "epoch": 5.212460063897764, + "grad_norm": 0.17849107086658478, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6526 + }, + { + "epoch": 5.213258785942492, + "grad_norm": 0.0910423994064331, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6527 + }, + { + "epoch": 5.214057507987221, + "grad_norm": 0.10857872664928436, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6528 + }, + { + "epoch": 5.214856230031949, + "grad_norm": 0.09012399613857269, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6529 + }, + { + "epoch": 5.215654952076678, + "grad_norm": 0.14724178612232208, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6530 + }, + { + "epoch": 5.216453674121405, + "grad_norm": 0.11357409507036209, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6531 + }, + { + "epoch": 5.217252396166134, + "grad_norm": 0.09721364825963974, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6532 + }, + { + "epoch": 5.218051118210862, + "grad_norm": 0.07837430387735367, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6533 + }, + { + "epoch": 5.218849840255591, + "grad_norm": 0.1181735098361969, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6534 + }, + { + "epoch": 5.2196485623003195, + "grad_norm": 0.07066017389297485, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6535 + }, + { + "epoch": 5.220447284345048, + "grad_norm": 0.06838417053222656, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6536 + }, + { + "epoch": 5.2212460063897765, + "grad_norm": 0.0919245257973671, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6537 + }, + { + "epoch": 5.222044728434505, + "grad_norm": 0.06859984248876572, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6538 + }, + { + "epoch": 5.222843450479234, + "grad_norm": 1.929213523864746, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6539 + }, + { + "epoch": 5.223642172523961, + "grad_norm": 0.11181562393903732, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6540 + }, + { + "epoch": 5.22444089456869, + "grad_norm": 0.09261998534202576, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6541 + }, + { + "epoch": 5.225239616613418, + "grad_norm": 0.11214403063058853, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6542 + }, + { + "epoch": 5.226038338658147, + "grad_norm": 0.1353820264339447, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6543 + }, + { + "epoch": 5.226837060702875, + "grad_norm": 0.11579953879117966, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 6544 + }, + { + "epoch": 5.227635782747604, + "grad_norm": 0.08284885436296463, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6545 + }, + { + "epoch": 5.228434504792332, + "grad_norm": 0.13805733621120453, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6546 + }, + { + "epoch": 5.229233226837061, + "grad_norm": 0.08924185484647751, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6547 + }, + { + "epoch": 5.2300319488817895, + "grad_norm": 0.10975285619497299, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6548 + }, + { + "epoch": 5.230830670926518, + "grad_norm": 0.10500271618366241, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6549 + }, + { + "epoch": 5.231629392971246, + "grad_norm": 0.09947814792394638, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 5.232428115015974, + "grad_norm": 0.10113594681024551, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6551 + }, + { + "epoch": 5.233226837060703, + "grad_norm": 0.12645265460014343, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6552 + }, + { + "epoch": 5.234025559105431, + "grad_norm": 0.06775741279125214, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6553 + }, + { + "epoch": 5.23482428115016, + "grad_norm": 0.09799529612064362, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6554 + }, + { + "epoch": 5.235623003194888, + "grad_norm": 0.13129538297653198, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6555 + }, + { + "epoch": 5.236421725239617, + "grad_norm": 0.10139735788106918, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6556 + }, + { + "epoch": 5.237220447284345, + "grad_norm": 0.13819058239459991, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6557 + }, + { + "epoch": 5.238019169329074, + "grad_norm": 0.09306512027978897, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6558 + }, + { + "epoch": 5.2388178913738015, + "grad_norm": 0.07963602244853973, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6559 + }, + { + "epoch": 5.23961661341853, + "grad_norm": 0.12864448130130768, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6560 + }, + { + "epoch": 5.2404153354632586, + "grad_norm": 0.1044403612613678, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6561 + }, + { + "epoch": 5.241214057507987, + "grad_norm": 0.07623843848705292, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6562 + }, + { + "epoch": 5.242012779552716, + "grad_norm": 0.10385097563266754, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6563 + }, + { + "epoch": 5.242811501597444, + "grad_norm": 0.07048188149929047, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6564 + }, + { + "epoch": 5.243610223642173, + "grad_norm": 0.25789955258369446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6565 + }, + { + "epoch": 5.244408945686901, + "grad_norm": 0.12271685153245926, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6566 + }, + { + "epoch": 5.24520766773163, + "grad_norm": 0.10512058436870575, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6567 + }, + { + "epoch": 5.246006389776358, + "grad_norm": 0.07663438469171524, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6568 + }, + { + "epoch": 5.246805111821086, + "grad_norm": 0.09937599301338196, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6569 + }, + { + "epoch": 5.247603833865814, + "grad_norm": 0.12242338061332703, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6570 + }, + { + "epoch": 5.248402555910543, + "grad_norm": 0.1733475625514984, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6571 + }, + { + "epoch": 5.2492012779552715, + "grad_norm": 0.1460944414138794, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6572 + }, + { + "epoch": 5.25, + "grad_norm": 0.09406521171331406, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6573 + }, + { + "epoch": 5.2507987220447285, + "grad_norm": 1.0146688222885132, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6574 + }, + { + "epoch": 5.251597444089457, + "grad_norm": 0.10557705909013748, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6575 + }, + { + "epoch": 5.252396166134186, + "grad_norm": 0.1306990385055542, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6576 + }, + { + "epoch": 5.253194888178914, + "grad_norm": 0.094961017370224, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6577 + }, + { + "epoch": 5.253993610223642, + "grad_norm": 0.13421863317489624, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6578 + }, + { + "epoch": 5.25479233226837, + "grad_norm": 0.12371776252985, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6579 + }, + { + "epoch": 5.255591054313099, + "grad_norm": 0.15863509476184845, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6580 + }, + { + "epoch": 5.256389776357827, + "grad_norm": 0.1156599149107933, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6581 + }, + { + "epoch": 5.257188498402556, + "grad_norm": 0.07102219015359879, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6582 + }, + { + "epoch": 5.257987220447284, + "grad_norm": 0.09030039608478546, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6583 + }, + { + "epoch": 5.258785942492013, + "grad_norm": 0.08848102390766144, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6584 + }, + { + "epoch": 5.2595846645367414, + "grad_norm": 0.07455430924892426, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6585 + }, + { + "epoch": 5.26038338658147, + "grad_norm": 0.07729559391736984, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6586 + }, + { + "epoch": 5.261182108626198, + "grad_norm": 0.0955357626080513, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6587 + }, + { + "epoch": 5.261980830670926, + "grad_norm": 0.08680911362171173, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6588 + }, + { + "epoch": 5.262779552715655, + "grad_norm": 0.1033414825797081, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6589 + }, + { + "epoch": 5.263578274760383, + "grad_norm": 0.09428979456424713, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6590 + }, + { + "epoch": 5.264376996805112, + "grad_norm": 0.07567942887544632, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6591 + }, + { + "epoch": 5.26517571884984, + "grad_norm": 0.221647247672081, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6592 + }, + { + "epoch": 5.265974440894569, + "grad_norm": 0.13839758932590485, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6593 + }, + { + "epoch": 5.266773162939297, + "grad_norm": 0.06060291454195976, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6594 + }, + { + "epoch": 5.267571884984026, + "grad_norm": 0.09146185964345932, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6595 + }, + { + "epoch": 5.268370607028754, + "grad_norm": 0.05557526275515556, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6596 + }, + { + "epoch": 5.269169329073483, + "grad_norm": 0.10190495103597641, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6597 + }, + { + "epoch": 5.2699680511182105, + "grad_norm": 0.07389659434556961, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6598 + }, + { + "epoch": 5.270766773162939, + "grad_norm": 0.11124115437269211, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6599 + }, + { + "epoch": 5.271565495207668, + "grad_norm": 0.10779515653848648, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6600 + }, + { + "epoch": 5.272364217252396, + "grad_norm": 0.09347773343324661, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6601 + }, + { + "epoch": 5.273162939297125, + "grad_norm": 0.15056683123111725, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6602 + }, + { + "epoch": 5.273961661341853, + "grad_norm": 0.1398572027683258, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6603 + }, + { + "epoch": 5.274760383386582, + "grad_norm": 0.08360682427883148, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6604 + }, + { + "epoch": 5.27555910543131, + "grad_norm": 0.10360747575759888, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6605 + }, + { + "epoch": 5.276357827476039, + "grad_norm": 0.0864897072315216, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6606 + }, + { + "epoch": 5.277156549520766, + "grad_norm": 0.11505412310361862, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6607 + }, + { + "epoch": 5.277955271565495, + "grad_norm": 0.10638110339641571, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6608 + }, + { + "epoch": 5.2787539936102235, + "grad_norm": 0.08349479734897614, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6609 + }, + { + "epoch": 5.279552715654952, + "grad_norm": 0.14465951919555664, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6610 + }, + { + "epoch": 5.2803514376996805, + "grad_norm": 0.08049577474594116, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6611 + }, + { + "epoch": 5.281150159744409, + "grad_norm": 0.10206092149019241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6612 + }, + { + "epoch": 5.281948881789138, + "grad_norm": 0.2721571922302246, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6613 + }, + { + "epoch": 5.282747603833866, + "grad_norm": 0.17503346502780914, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6614 + }, + { + "epoch": 5.283546325878595, + "grad_norm": 0.11459292471408844, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6615 + }, + { + "epoch": 5.284345047923322, + "grad_norm": 0.9974967241287231, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6616 + }, + { + "epoch": 5.285143769968051, + "grad_norm": 0.11502816528081894, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6617 + }, + { + "epoch": 5.285942492012779, + "grad_norm": 0.12992256879806519, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6618 + }, + { + "epoch": 5.286741214057508, + "grad_norm": 0.19872024655342102, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6619 + }, + { + "epoch": 5.287539936102236, + "grad_norm": 0.13013097643852234, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6620 + }, + { + "epoch": 5.288338658146965, + "grad_norm": 0.13644525408744812, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6621 + }, + { + "epoch": 5.289137380191693, + "grad_norm": 0.15101996064186096, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6622 + }, + { + "epoch": 5.289936102236422, + "grad_norm": 0.11075131595134735, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6623 + }, + { + "epoch": 5.2907348242811505, + "grad_norm": 0.0904511958360672, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6624 + }, + { + "epoch": 5.291533546325878, + "grad_norm": 0.08861460536718369, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6625 + }, + { + "epoch": 5.292332268370607, + "grad_norm": 0.10443824529647827, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6626 + }, + { + "epoch": 5.293130990415335, + "grad_norm": 0.07440674304962158, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6627 + }, + { + "epoch": 5.293929712460064, + "grad_norm": 0.21709975600242615, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6628 + }, + { + "epoch": 5.294728434504792, + "grad_norm": 0.1281055063009262, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6629 + }, + { + "epoch": 5.295527156549521, + "grad_norm": 0.10365202277898788, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6630 + }, + { + "epoch": 5.296325878594249, + "grad_norm": 1.004258632659912, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6631 + }, + { + "epoch": 5.297124600638978, + "grad_norm": 0.16660870611667633, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6632 + }, + { + "epoch": 5.297923322683706, + "grad_norm": 0.1146734207868576, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6633 + }, + { + "epoch": 5.298722044728435, + "grad_norm": 0.18288104236125946, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6634 + }, + { + "epoch": 5.2995207667731625, + "grad_norm": 0.11469347029924393, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6635 + }, + { + "epoch": 5.300319488817891, + "grad_norm": 0.1333407461643219, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6636 + }, + { + "epoch": 5.30111821086262, + "grad_norm": 0.15359243750572205, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6637 + }, + { + "epoch": 5.301916932907348, + "grad_norm": 0.0832027792930603, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6638 + }, + { + "epoch": 5.302715654952077, + "grad_norm": 0.10231718420982361, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6639 + }, + { + "epoch": 5.303514376996805, + "grad_norm": 0.11031626909971237, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6640 + }, + { + "epoch": 5.304313099041534, + "grad_norm": 0.08014792948961258, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6641 + }, + { + "epoch": 5.305111821086262, + "grad_norm": 0.10066475719213486, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6642 + }, + { + "epoch": 5.305910543130991, + "grad_norm": 0.12824396789073944, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6643 + }, + { + "epoch": 5.306709265175719, + "grad_norm": 0.09452345222234726, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6644 + }, + { + "epoch": 5.307507987220447, + "grad_norm": 0.09100557118654251, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6645 + }, + { + "epoch": 5.3083067092651754, + "grad_norm": 0.07995713502168655, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6646 + }, + { + "epoch": 5.309105431309904, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6647 + }, + { + "epoch": 5.3099041533546325, + "grad_norm": 0.09881234914064407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6648 + }, + { + "epoch": 5.310702875399361, + "grad_norm": 0.08131393790245056, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6649 + }, + { + "epoch": 5.31150159744409, + "grad_norm": 0.08842889964580536, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6650 + }, + { + "epoch": 5.312300319488818, + "grad_norm": 0.12630115449428558, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6651 + }, + { + "epoch": 5.313099041533547, + "grad_norm": 0.13429711759090424, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6652 + }, + { + "epoch": 5.313897763578275, + "grad_norm": 0.11347261816263199, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6653 + }, + { + "epoch": 5.314696485623003, + "grad_norm": 0.1555728167295456, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6654 + }, + { + "epoch": 5.315495207667731, + "grad_norm": 0.13184282183647156, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6655 + }, + { + "epoch": 5.31629392971246, + "grad_norm": 0.07821093499660492, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6656 + }, + { + "epoch": 5.317092651757188, + "grad_norm": 0.1300499588251114, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6657 + }, + { + "epoch": 5.317891373801917, + "grad_norm": 0.14896781742572784, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6658 + }, + { + "epoch": 5.318690095846645, + "grad_norm": 0.13370175659656525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6659 + }, + { + "epoch": 5.319488817891374, + "grad_norm": 0.14055652916431427, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6660 + }, + { + "epoch": 5.3202875399361025, + "grad_norm": 0.11674464493989944, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6661 + }, + { + "epoch": 5.321086261980831, + "grad_norm": 0.13155756890773773, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6662 + }, + { + "epoch": 5.321884984025559, + "grad_norm": 0.09616535156965256, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6663 + }, + { + "epoch": 5.322683706070287, + "grad_norm": 0.4228188991546631, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6664 + }, + { + "epoch": 5.323482428115016, + "grad_norm": 0.10942913591861725, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6665 + }, + { + "epoch": 5.324281150159744, + "grad_norm": 0.15592730045318604, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6666 + }, + { + "epoch": 5.325079872204473, + "grad_norm": 0.16837753355503082, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6667 + }, + { + "epoch": 5.325878594249201, + "grad_norm": 0.10512012243270874, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6668 + }, + { + "epoch": 5.32667731629393, + "grad_norm": 0.10834471136331558, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6669 + }, + { + "epoch": 5.327476038338658, + "grad_norm": 0.06588451564311981, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6670 + }, + { + "epoch": 5.328274760383387, + "grad_norm": 0.08714822679758072, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6671 + }, + { + "epoch": 5.329073482428115, + "grad_norm": 0.16129685938358307, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6672 + }, + { + "epoch": 5.329872204472843, + "grad_norm": 0.09294751286506653, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6673 + }, + { + "epoch": 5.330670926517572, + "grad_norm": 0.09905052185058594, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6674 + }, + { + "epoch": 5.3314696485623, + "grad_norm": 0.14584603905677795, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6675 + }, + { + "epoch": 5.332268370607029, + "grad_norm": 0.08384378254413605, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6676 + }, + { + "epoch": 5.333067092651757, + "grad_norm": 0.1672045886516571, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6677 + }, + { + "epoch": 5.333865814696486, + "grad_norm": 0.21656489372253418, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6678 + }, + { + "epoch": 5.334664536741214, + "grad_norm": 0.17034684121608734, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6679 + }, + { + "epoch": 5.335463258785943, + "grad_norm": 0.3153417408466339, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6680 + }, + { + "epoch": 5.336261980830671, + "grad_norm": 0.1953393816947937, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6681 + }, + { + "epoch": 5.3370607028754, + "grad_norm": 0.2085847705602646, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6682 + }, + { + "epoch": 5.337859424920127, + "grad_norm": 0.2679558992385864, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6683 + }, + { + "epoch": 5.338658146964856, + "grad_norm": 0.08705966919660568, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6684 + }, + { + "epoch": 5.3394568690095845, + "grad_norm": 0.09011410176753998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6685 + }, + { + "epoch": 5.340255591054313, + "grad_norm": 0.10358326137065887, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6686 + }, + { + "epoch": 5.3410543130990416, + "grad_norm": 0.08191518485546112, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6687 + }, + { + "epoch": 5.34185303514377, + "grad_norm": 0.0676165446639061, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6688 + }, + { + "epoch": 5.342651757188499, + "grad_norm": 0.18006695806980133, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6689 + }, + { + "epoch": 5.343450479233227, + "grad_norm": 0.11935598403215408, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.344249201277956, + "grad_norm": 0.14136075973510742, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6691 + }, + { + "epoch": 5.345047923322683, + "grad_norm": 0.19367988407611847, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6692 + }, + { + "epoch": 5.345846645367412, + "grad_norm": 0.1283622533082962, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6693 + }, + { + "epoch": 5.34664536741214, + "grad_norm": 0.11303326487541199, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6694 + }, + { + "epoch": 5.347444089456869, + "grad_norm": 0.09076731652021408, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6695 + }, + { + "epoch": 5.348242811501597, + "grad_norm": 0.12625159323215485, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6696 + }, + { + "epoch": 5.349041533546326, + "grad_norm": 0.18254370987415314, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6697 + }, + { + "epoch": 5.3498402555910545, + "grad_norm": 0.12221173942089081, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6698 + }, + { + "epoch": 5.350638977635783, + "grad_norm": 0.11586996912956238, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6699 + }, + { + "epoch": 5.3514376996805115, + "grad_norm": 0.1012619286775589, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6700 + }, + { + "epoch": 5.352236421725239, + "grad_norm": 0.10728003084659576, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6701 + }, + { + "epoch": 5.353035143769968, + "grad_norm": 0.08077894896268845, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6702 + }, + { + "epoch": 5.353833865814696, + "grad_norm": 0.10069102048873901, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6703 + }, + { + "epoch": 5.354632587859425, + "grad_norm": 0.11007717996835709, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6704 + }, + { + "epoch": 5.355431309904153, + "grad_norm": 0.08088147640228271, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6705 + }, + { + "epoch": 5.356230031948882, + "grad_norm": 0.06969337165355682, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6706 + }, + { + "epoch": 5.35702875399361, + "grad_norm": 0.09731647372245789, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6707 + }, + { + "epoch": 5.357827476038339, + "grad_norm": 0.07404995709657669, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6708 + }, + { + "epoch": 5.358626198083067, + "grad_norm": 0.09361755102872849, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6709 + }, + { + "epoch": 5.359424920127796, + "grad_norm": 0.11929210275411606, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6710 + }, + { + "epoch": 5.360223642172524, + "grad_norm": 0.11107892543077469, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6711 + }, + { + "epoch": 5.361022364217252, + "grad_norm": 0.10966535657644272, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6712 + }, + { + "epoch": 5.361821086261981, + "grad_norm": 0.11830565333366394, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6713 + }, + { + "epoch": 5.362619808306709, + "grad_norm": 0.15130563080310822, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6714 + }, + { + "epoch": 5.363418530351438, + "grad_norm": 0.12608309090137482, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6715 + }, + { + "epoch": 5.364217252396166, + "grad_norm": 0.10768693685531616, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6716 + }, + { + "epoch": 5.365015974440895, + "grad_norm": 0.10020256787538528, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6717 + }, + { + "epoch": 5.365814696485623, + "grad_norm": 0.11352406442165375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6718 + }, + { + "epoch": 5.366613418530352, + "grad_norm": 0.10058535635471344, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6719 + }, + { + "epoch": 5.36741214057508, + "grad_norm": 0.08427922427654266, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6720 + }, + { + "epoch": 5.368210862619808, + "grad_norm": 0.08600196242332458, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6721 + }, + { + "epoch": 5.3690095846645365, + "grad_norm": 0.0891844630241394, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 6722 + }, + { + "epoch": 5.369808306709265, + "grad_norm": 0.07231339812278748, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6723 + }, + { + "epoch": 5.3706070287539935, + "grad_norm": 0.0866503193974495, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6724 + }, + { + "epoch": 5.371405750798722, + "grad_norm": 0.44905656576156616, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6725 + }, + { + "epoch": 5.372204472843451, + "grad_norm": 0.2192242592573166, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6726 + }, + { + "epoch": 5.373003194888179, + "grad_norm": 0.15841859579086304, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6727 + }, + { + "epoch": 5.373801916932908, + "grad_norm": 0.1254468858242035, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6728 + }, + { + "epoch": 5.374600638977636, + "grad_norm": 1.5675911903381348, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6729 + }, + { + "epoch": 5.375399361022364, + "grad_norm": 0.20507164299488068, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6730 + }, + { + "epoch": 5.376198083067092, + "grad_norm": 0.26948630809783936, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6731 + }, + { + "epoch": 5.376996805111821, + "grad_norm": 0.15447315573692322, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6732 + }, + { + "epoch": 5.377795527156549, + "grad_norm": 0.17888243496418, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6733 + }, + { + "epoch": 5.378594249201278, + "grad_norm": 0.24683290719985962, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6734 + }, + { + "epoch": 5.3793929712460065, + "grad_norm": 0.15786881744861603, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6735 + }, + { + "epoch": 5.380191693290735, + "grad_norm": 0.18426702916622162, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6736 + }, + { + "epoch": 5.3809904153354635, + "grad_norm": 0.14444448053836823, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6737 + }, + { + "epoch": 5.381789137380192, + "grad_norm": 0.135011225938797, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6738 + }, + { + "epoch": 5.38258785942492, + "grad_norm": 0.19057826697826385, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6739 + }, + { + "epoch": 5.383386581469648, + "grad_norm": 0.12282486259937286, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6740 + }, + { + "epoch": 5.384185303514377, + "grad_norm": 0.17092294991016388, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6741 + }, + { + "epoch": 5.384984025559105, + "grad_norm": 0.19800473749637604, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6742 + }, + { + "epoch": 5.385782747603834, + "grad_norm": 0.07987766712903976, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6743 + }, + { + "epoch": 5.386581469648562, + "grad_norm": 0.18386386334896088, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6744 + }, + { + "epoch": 5.387380191693291, + "grad_norm": 0.16529197990894318, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6745 + }, + { + "epoch": 5.388178913738019, + "grad_norm": 0.09607496112585068, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6746 + }, + { + "epoch": 5.388977635782748, + "grad_norm": 0.15966713428497314, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6747 + }, + { + "epoch": 5.389776357827476, + "grad_norm": 0.1622796356678009, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6748 + }, + { + "epoch": 5.390575079872204, + "grad_norm": 0.09537432342767715, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6749 + }, + { + "epoch": 5.391373801916933, + "grad_norm": 0.1766965389251709, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6750 + }, + { + "epoch": 5.392172523961661, + "grad_norm": 0.21354711055755615, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6751 + }, + { + "epoch": 5.39297124600639, + "grad_norm": 0.093564473092556, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6752 + }, + { + "epoch": 5.393769968051118, + "grad_norm": 0.14756347239017487, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6753 + }, + { + "epoch": 5.394568690095847, + "grad_norm": 0.10537468641996384, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6754 + }, + { + "epoch": 5.395367412140575, + "grad_norm": 0.15626567602157593, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6755 + }, + { + "epoch": 5.396166134185304, + "grad_norm": 0.16282637417316437, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6756 + }, + { + "epoch": 5.396964856230032, + "grad_norm": 0.0745241791009903, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6757 + }, + { + "epoch": 5.397763578274761, + "grad_norm": 0.1221894845366478, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6758 + }, + { + "epoch": 5.3985623003194885, + "grad_norm": 0.08314131945371628, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6759 + }, + { + "epoch": 5.399361022364217, + "grad_norm": 0.12707264721393585, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6760 + }, + { + "epoch": 5.4001597444089455, + "grad_norm": 0.12036006152629852, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6761 + }, + { + "epoch": 5.400958466453674, + "grad_norm": 0.12769176065921783, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6762 + }, + { + "epoch": 5.401757188498403, + "grad_norm": 0.2201661318540573, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6763 + }, + { + "epoch": 5.402555910543131, + "grad_norm": 0.15013982355594635, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6764 + }, + { + "epoch": 5.40335463258786, + "grad_norm": 0.7714766263961792, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6765 + }, + { + "epoch": 5.404153354632588, + "grad_norm": 0.20359933376312256, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6766 + }, + { + "epoch": 5.404952076677317, + "grad_norm": 0.12684984505176544, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6767 + }, + { + "epoch": 5.405750798722044, + "grad_norm": 0.09804195165634155, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6768 + }, + { + "epoch": 5.406549520766773, + "grad_norm": 0.10416880995035172, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6769 + }, + { + "epoch": 5.407348242811501, + "grad_norm": 0.1509416699409485, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6770 + }, + { + "epoch": 5.40814696485623, + "grad_norm": 0.15458443760871887, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6771 + }, + { + "epoch": 5.4089456869009584, + "grad_norm": 0.08355830609798431, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6772 + }, + { + "epoch": 5.409744408945687, + "grad_norm": 0.1228979080915451, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6773 + }, + { + "epoch": 5.4105431309904155, + "grad_norm": 0.12139632552862167, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6774 + }, + { + "epoch": 5.411341853035144, + "grad_norm": 0.16298502683639526, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6775 + }, + { + "epoch": 5.412140575079873, + "grad_norm": 0.09110788255929947, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6776 + }, + { + "epoch": 5.4129392971246, + "grad_norm": 0.08584781736135483, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6777 + }, + { + "epoch": 5.413738019169329, + "grad_norm": 0.10148828476667404, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6778 + }, + { + "epoch": 5.414536741214057, + "grad_norm": 0.1046212688088417, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6779 + }, + { + "epoch": 5.415335463258786, + "grad_norm": 0.12530827522277832, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6780 + }, + { + "epoch": 5.416134185303514, + "grad_norm": 0.07337464392185211, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6781 + }, + { + "epoch": 5.416932907348243, + "grad_norm": 0.10839185118675232, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6782 + }, + { + "epoch": 5.417731629392971, + "grad_norm": 0.07784926891326904, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6783 + }, + { + "epoch": 5.4185303514377, + "grad_norm": 0.08692190796136856, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6784 + }, + { + "epoch": 5.419329073482428, + "grad_norm": 0.08721921592950821, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6785 + }, + { + "epoch": 5.420127795527157, + "grad_norm": 0.09581280499696732, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6786 + }, + { + "epoch": 5.420926517571885, + "grad_norm": 0.1156916618347168, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6787 + }, + { + "epoch": 5.421725239616613, + "grad_norm": 0.4520327150821686, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6788 + }, + { + "epoch": 5.422523961661342, + "grad_norm": 0.0948205217719078, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6789 + }, + { + "epoch": 5.42332268370607, + "grad_norm": 0.07208927720785141, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6790 + }, + { + "epoch": 5.424121405750799, + "grad_norm": 0.06830724328756332, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6791 + }, + { + "epoch": 5.424920127795527, + "grad_norm": 0.10488666594028473, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6792 + }, + { + "epoch": 5.425718849840256, + "grad_norm": 0.08509235084056854, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6793 + }, + { + "epoch": 5.426517571884984, + "grad_norm": 0.09133832901716232, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6794 + }, + { + "epoch": 5.427316293929713, + "grad_norm": 0.11715687066316605, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6795 + }, + { + "epoch": 5.428115015974441, + "grad_norm": 0.1196032389998436, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6796 + }, + { + "epoch": 5.428913738019169, + "grad_norm": 0.14141549170017242, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6797 + }, + { + "epoch": 5.4297124600638975, + "grad_norm": 0.12866206467151642, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6798 + }, + { + "epoch": 5.430511182108626, + "grad_norm": 0.10802716016769409, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6799 + }, + { + "epoch": 5.431309904153355, + "grad_norm": 0.10947239398956299, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6800 + }, + { + "epoch": 5.432108626198083, + "grad_norm": 0.08339721709489822, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6801 + }, + { + "epoch": 5.432907348242812, + "grad_norm": 0.12407296150922775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6802 + }, + { + "epoch": 5.43370607028754, + "grad_norm": 0.10537894070148468, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6803 + }, + { + "epoch": 5.434504792332269, + "grad_norm": 0.0920059084892273, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6804 + }, + { + "epoch": 5.435303514376997, + "grad_norm": 0.1502516269683838, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6805 + }, + { + "epoch": 5.436102236421725, + "grad_norm": 0.2798864245414734, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 6806 + }, + { + "epoch": 5.436900958466453, + "grad_norm": 0.11037585884332657, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6807 + }, + { + "epoch": 5.437699680511182, + "grad_norm": 0.12594881653785706, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6808 + }, + { + "epoch": 5.43849840255591, + "grad_norm": 0.09976109862327576, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6809 + }, + { + "epoch": 5.439297124600639, + "grad_norm": 0.3285512328147888, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6810 + }, + { + "epoch": 5.4400958466453675, + "grad_norm": 0.49450287222862244, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6811 + }, + { + "epoch": 5.440894568690096, + "grad_norm": 0.06817556917667389, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6812 + }, + { + "epoch": 5.4416932907348246, + "grad_norm": 0.14917057752609253, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6813 + }, + { + "epoch": 5.442492012779553, + "grad_norm": 0.10008134692907333, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6814 + }, + { + "epoch": 5.443290734824281, + "grad_norm": 0.07854767143726349, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6815 + }, + { + "epoch": 5.444089456869009, + "grad_norm": 0.2441248893737793, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6816 + }, + { + "epoch": 5.444888178913738, + "grad_norm": 0.1276157647371292, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6817 + }, + { + "epoch": 5.445686900958466, + "grad_norm": 0.11779431253671646, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6818 + }, + { + "epoch": 5.446485623003195, + "grad_norm": 0.11788108944892883, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6819 + }, + { + "epoch": 5.447284345047923, + "grad_norm": 0.06554995477199554, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6820 + }, + { + "epoch": 5.448083067092652, + "grad_norm": 0.07937108725309372, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6821 + }, + { + "epoch": 5.44888178913738, + "grad_norm": 0.08041426539421082, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6822 + }, + { + "epoch": 5.449680511182109, + "grad_norm": 0.12429161369800568, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6823 + }, + { + "epoch": 5.4504792332268375, + "grad_norm": 0.09993165731430054, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6824 + }, + { + "epoch": 5.451277955271565, + "grad_norm": 0.07077670097351074, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6825 + }, + { + "epoch": 5.452076677316294, + "grad_norm": 0.12163005024194717, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6826 + }, + { + "epoch": 5.452875399361022, + "grad_norm": 0.19080819189548492, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6827 + }, + { + "epoch": 5.453674121405751, + "grad_norm": 0.06450853496789932, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6828 + }, + { + "epoch": 5.454472843450479, + "grad_norm": 0.8893078565597534, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6829 + }, + { + "epoch": 5.455271565495208, + "grad_norm": 0.08225185424089432, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6830 + }, + { + "epoch": 5.456070287539936, + "grad_norm": 0.08631845563650131, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6831 + }, + { + "epoch": 5.456869009584665, + "grad_norm": 0.1858949214220047, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6832 + }, + { + "epoch": 5.457667731629393, + "grad_norm": 0.10997786372900009, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6833 + }, + { + "epoch": 5.458466453674122, + "grad_norm": 0.09691416472196579, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6834 + }, + { + "epoch": 5.4592651757188495, + "grad_norm": 0.12523561716079712, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6835 + }, + { + "epoch": 5.460063897763578, + "grad_norm": 0.10094364732503891, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6836 + }, + { + "epoch": 5.460862619808307, + "grad_norm": 0.06598310172557831, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6837 + }, + { + "epoch": 5.461661341853035, + "grad_norm": 0.10221479833126068, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6838 + }, + { + "epoch": 5.462460063897764, + "grad_norm": 0.6545975804328918, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6839 + }, + { + "epoch": 5.463258785942492, + "grad_norm": 0.12167128920555115, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6840 + }, + { + "epoch": 5.464057507987221, + "grad_norm": 0.10822924226522446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6841 + }, + { + "epoch": 5.464856230031949, + "grad_norm": 0.11905575543642044, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6842 + }, + { + "epoch": 5.465654952076678, + "grad_norm": 0.10276103764772415, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6843 + }, + { + "epoch": 5.466453674121405, + "grad_norm": 0.09087378531694412, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6844 + }, + { + "epoch": 5.467252396166134, + "grad_norm": 0.13117510080337524, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6845 + }, + { + "epoch": 5.468051118210862, + "grad_norm": 0.14824305474758148, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6846 + }, + { + "epoch": 5.468849840255591, + "grad_norm": 0.08553508669137955, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6847 + }, + { + "epoch": 5.4696485623003195, + "grad_norm": 0.12209141999483109, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6848 + }, + { + "epoch": 5.470447284345048, + "grad_norm": 0.1992058902978897, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6849 + }, + { + "epoch": 5.4712460063897765, + "grad_norm": 0.08518865704536438, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6850 + }, + { + "epoch": 5.472044728434505, + "grad_norm": 0.10496464371681213, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6851 + }, + { + "epoch": 5.472843450479234, + "grad_norm": 0.08789866417646408, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6852 + }, + { + "epoch": 5.473642172523961, + "grad_norm": 0.08592598885297775, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6853 + }, + { + "epoch": 5.47444089456869, + "grad_norm": 0.061165813356637955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6854 + }, + { + "epoch": 5.475239616613418, + "grad_norm": 0.06936467438936234, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6855 + }, + { + "epoch": 5.476038338658147, + "grad_norm": 0.20519734919071198, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6856 + }, + { + "epoch": 5.476837060702875, + "grad_norm": 0.087073415517807, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6857 + }, + { + "epoch": 5.477635782747604, + "grad_norm": 0.10153642296791077, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6858 + }, + { + "epoch": 5.478434504792332, + "grad_norm": 0.12416163831949234, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6859 + }, + { + "epoch": 5.479233226837061, + "grad_norm": 0.1047174334526062, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6860 + }, + { + "epoch": 5.4800319488817895, + "grad_norm": 0.13690868020057678, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6861 + }, + { + "epoch": 5.480830670926517, + "grad_norm": 0.15995970368385315, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6862 + }, + { + "epoch": 5.481629392971246, + "grad_norm": 0.08172900229692459, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6863 + }, + { + "epoch": 5.482428115015974, + "grad_norm": 0.10956761986017227, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6864 + }, + { + "epoch": 5.483226837060703, + "grad_norm": 0.12259931862354279, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6865 + }, + { + "epoch": 5.484025559105431, + "grad_norm": 0.08295698463916779, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6866 + }, + { + "epoch": 5.48482428115016, + "grad_norm": 0.10935505479574203, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6867 + }, + { + "epoch": 5.485623003194888, + "grad_norm": 0.12436006963253021, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6868 + }, + { + "epoch": 5.486421725239617, + "grad_norm": 0.08449307829141617, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6869 + }, + { + "epoch": 5.487220447284345, + "grad_norm": 0.10897113382816315, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6870 + }, + { + "epoch": 5.488019169329074, + "grad_norm": 0.06856910139322281, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6871 + }, + { + "epoch": 5.488817891373802, + "grad_norm": 0.07105988264083862, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6872 + }, + { + "epoch": 5.48961661341853, + "grad_norm": 0.08778723329305649, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6873 + }, + { + "epoch": 5.4904153354632586, + "grad_norm": 0.07818275690078735, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6874 + }, + { + "epoch": 5.491214057507987, + "grad_norm": 0.08410139381885529, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6875 + }, + { + "epoch": 5.492012779552716, + "grad_norm": 0.0804608166217804, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6876 + }, + { + "epoch": 5.492811501597444, + "grad_norm": 0.10089578479528427, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6877 + }, + { + "epoch": 5.493610223642173, + "grad_norm": 0.08231056481599808, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6878 + }, + { + "epoch": 5.494408945686901, + "grad_norm": 0.07642059773206711, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6879 + }, + { + "epoch": 5.49520766773163, + "grad_norm": 0.11312755942344666, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6880 + }, + { + "epoch": 5.496006389776358, + "grad_norm": 0.06288543343544006, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6881 + }, + { + "epoch": 5.496805111821086, + "grad_norm": 0.09648934751749039, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6882 + }, + { + "epoch": 5.497603833865814, + "grad_norm": 0.09374719858169556, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6883 + }, + { + "epoch": 5.498402555910543, + "grad_norm": 0.10596928000450134, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6884 + }, + { + "epoch": 5.4992012779552715, + "grad_norm": 0.06540077924728394, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6885 + }, + { + "epoch": 5.5, + "grad_norm": 0.05208199843764305, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6886 + }, + { + "epoch": 5.5007987220447285, + "grad_norm": 0.10762238502502441, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6887 + }, + { + "epoch": 5.501597444089457, + "grad_norm": 0.122553251683712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6888 + }, + { + "epoch": 5.502396166134186, + "grad_norm": 0.07663412392139435, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6889 + }, + { + "epoch": 5.503194888178914, + "grad_norm": 0.09100968390703201, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6890 + }, + { + "epoch": 5.503993610223642, + "grad_norm": 0.24931807816028595, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6891 + }, + { + "epoch": 5.50479233226837, + "grad_norm": 0.07812821120023727, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6892 + }, + { + "epoch": 5.505591054313099, + "grad_norm": 0.04760657623410225, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6893 + }, + { + "epoch": 5.506389776357827, + "grad_norm": 0.08183290809392929, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6894 + }, + { + "epoch": 5.507188498402556, + "grad_norm": 0.09541092067956924, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6895 + }, + { + "epoch": 5.507987220447284, + "grad_norm": 0.04168708249926567, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6896 + }, + { + "epoch": 5.508785942492013, + "grad_norm": 0.07038994133472443, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6897 + }, + { + "epoch": 5.5095846645367414, + "grad_norm": 0.060375142842531204, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6898 + }, + { + "epoch": 5.51038338658147, + "grad_norm": 0.048829223960638046, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6899 + }, + { + "epoch": 5.511182108626198, + "grad_norm": 0.057894766330718994, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6900 + }, + { + "epoch": 5.511980830670926, + "grad_norm": 0.05786101892590523, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6901 + }, + { + "epoch": 5.512779552715655, + "grad_norm": 0.07246953994035721, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6902 + }, + { + "epoch": 5.513578274760383, + "grad_norm": 0.07493462413549423, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6903 + }, + { + "epoch": 5.514376996805112, + "grad_norm": 0.060612600296735764, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6904 + }, + { + "epoch": 5.51517571884984, + "grad_norm": 0.0666302740573883, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6905 + }, + { + "epoch": 5.515974440894569, + "grad_norm": 0.08713024109601974, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6906 + }, + { + "epoch": 5.516773162939297, + "grad_norm": 0.31083860993385315, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6907 + }, + { + "epoch": 5.517571884984026, + "grad_norm": 0.0808933675289154, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6908 + }, + { + "epoch": 5.518370607028754, + "grad_norm": 0.1312016248703003, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6909 + }, + { + "epoch": 5.519169329073483, + "grad_norm": 0.20448890328407288, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6910 + }, + { + "epoch": 5.5199680511182105, + "grad_norm": 0.2519006133079529, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6911 + }, + { + "epoch": 5.520766773162939, + "grad_norm": 0.11359903216362, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6912 + }, + { + "epoch": 5.521565495207668, + "grad_norm": 0.07498760521411896, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6913 + }, + { + "epoch": 5.522364217252396, + "grad_norm": 0.06599561125040054, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6914 + }, + { + "epoch": 5.523162939297125, + "grad_norm": 0.08988697826862335, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6915 + }, + { + "epoch": 5.523961661341853, + "grad_norm": 0.06968241930007935, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6916 + }, + { + "epoch": 5.524760383386582, + "grad_norm": 0.07231415063142776, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6917 + }, + { + "epoch": 5.52555910543131, + "grad_norm": 0.07369428128004074, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6918 + }, + { + "epoch": 5.526357827476039, + "grad_norm": 0.07677069306373596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6919 + }, + { + "epoch": 5.527156549520766, + "grad_norm": 0.07391869276762009, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6920 + }, + { + "epoch": 5.527955271565495, + "grad_norm": 0.05270293354988098, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6921 + }, + { + "epoch": 5.5287539936102235, + "grad_norm": 0.10439106076955795, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6922 + }, + { + "epoch": 5.529552715654952, + "grad_norm": 0.06968904286623001, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6923 + }, + { + "epoch": 5.5303514376996805, + "grad_norm": 0.08401032537221909, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6924 + }, + { + "epoch": 5.531150159744409, + "grad_norm": 0.11993245035409927, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6925 + }, + { + "epoch": 5.531948881789138, + "grad_norm": 0.05857640504837036, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6926 + }, + { + "epoch": 5.532747603833866, + "grad_norm": 0.10513442009687424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6927 + }, + { + "epoch": 5.533546325878595, + "grad_norm": 0.12233056873083115, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6928 + }, + { + "epoch": 5.534345047923322, + "grad_norm": 0.06959997117519379, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6929 + }, + { + "epoch": 5.535143769968051, + "grad_norm": 0.08057182282209396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6930 + }, + { + "epoch": 5.535942492012779, + "grad_norm": 0.09816458821296692, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6931 + }, + { + "epoch": 5.536741214057508, + "grad_norm": 0.055738940834999084, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6932 + }, + { + "epoch": 5.537539936102236, + "grad_norm": 0.0939234122633934, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6933 + }, + { + "epoch": 5.538338658146965, + "grad_norm": 0.12143029272556305, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6934 + }, + { + "epoch": 5.539137380191693, + "grad_norm": 0.08409210294485092, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6935 + }, + { + "epoch": 5.539936102236422, + "grad_norm": 0.10690448433160782, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6936 + }, + { + "epoch": 5.5407348242811505, + "grad_norm": 0.20701836049556732, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6937 + }, + { + "epoch": 5.541533546325878, + "grad_norm": 0.09124163538217545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6938 + }, + { + "epoch": 5.542332268370607, + "grad_norm": 0.08295103162527084, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6939 + }, + { + "epoch": 5.543130990415335, + "grad_norm": 0.1179230809211731, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6940 + }, + { + "epoch": 5.543929712460064, + "grad_norm": 0.12345689535140991, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6941 + }, + { + "epoch": 5.544728434504792, + "grad_norm": 0.052616000175476074, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6942 + }, + { + "epoch": 5.545527156549521, + "grad_norm": 0.07918131351470947, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6943 + }, + { + "epoch": 5.546325878594249, + "grad_norm": 0.04847119748592377, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6944 + }, + { + "epoch": 5.547124600638978, + "grad_norm": 0.06204143166542053, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6945 + }, + { + "epoch": 5.547923322683706, + "grad_norm": 0.07778293639421463, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6946 + }, + { + "epoch": 5.548722044728435, + "grad_norm": 0.05037623643875122, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6947 + }, + { + "epoch": 5.549520766773163, + "grad_norm": 0.09024710208177567, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 6948 + }, + { + "epoch": 5.550319488817891, + "grad_norm": 0.0872211754322052, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6949 + }, + { + "epoch": 5.55111821086262, + "grad_norm": 0.08456625789403915, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6950 + }, + { + "epoch": 5.551916932907348, + "grad_norm": 0.054692018777132034, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6951 + }, + { + "epoch": 5.552715654952077, + "grad_norm": 0.10690787434577942, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6952 + }, + { + "epoch": 5.553514376996805, + "grad_norm": 0.07764400541782379, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6953 + }, + { + "epoch": 5.554313099041534, + "grad_norm": 0.08423051983118057, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6954 + }, + { + "epoch": 5.555111821086262, + "grad_norm": 0.06771727651357651, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6955 + }, + { + "epoch": 5.555910543130991, + "grad_norm": 0.10505887866020203, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6956 + }, + { + "epoch": 5.556709265175719, + "grad_norm": 0.054641906172037125, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6957 + }, + { + "epoch": 5.557507987220447, + "grad_norm": 0.05115118622779846, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6958 + }, + { + "epoch": 5.5583067092651754, + "grad_norm": 0.07177245616912842, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6959 + }, + { + "epoch": 5.559105431309904, + "grad_norm": 0.06642751395702362, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6960 + }, + { + "epoch": 5.5599041533546325, + "grad_norm": 0.08428867161273956, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6961 + }, + { + "epoch": 5.560702875399361, + "grad_norm": 0.044375378638505936, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6962 + }, + { + "epoch": 5.56150159744409, + "grad_norm": 0.06384986639022827, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6963 + }, + { + "epoch": 5.562300319488818, + "grad_norm": 0.052885912358760834, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6964 + }, + { + "epoch": 5.563099041533547, + "grad_norm": 0.05244029313325882, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6965 + }, + { + "epoch": 5.563897763578275, + "grad_norm": 0.1781054139137268, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6966 + }, + { + "epoch": 5.564696485623003, + "grad_norm": 0.8067191243171692, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6967 + }, + { + "epoch": 5.565495207667731, + "grad_norm": 0.0759076327085495, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6968 + }, + { + "epoch": 5.56629392971246, + "grad_norm": 0.0820186585187912, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6969 + }, + { + "epoch": 5.567092651757188, + "grad_norm": 2.901848316192627, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6970 + }, + { + "epoch": 5.567891373801917, + "grad_norm": 0.5663259625434875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6971 + }, + { + "epoch": 5.568690095846645, + "grad_norm": 0.34909728169441223, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6972 + }, + { + "epoch": 5.569488817891374, + "grad_norm": 0.3031843602657318, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6973 + }, + { + "epoch": 5.5702875399361025, + "grad_norm": 0.9258882403373718, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6974 + }, + { + "epoch": 5.571086261980831, + "grad_norm": 0.37162891030311584, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6975 + }, + { + "epoch": 5.571884984025559, + "grad_norm": 0.11269918829202652, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6976 + }, + { + "epoch": 5.572683706070287, + "grad_norm": 0.20953021943569183, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6977 + }, + { + "epoch": 5.573482428115016, + "grad_norm": 0.22324982285499573, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6978 + }, + { + "epoch": 5.574281150159744, + "grad_norm": 0.47017180919647217, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6979 + }, + { + "epoch": 5.575079872204473, + "grad_norm": 0.22266747057437897, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 6980 + }, + { + "epoch": 5.575878594249201, + "grad_norm": 0.1609373688697815, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6981 + }, + { + "epoch": 5.57667731629393, + "grad_norm": 0.17458784580230713, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6982 + }, + { + "epoch": 5.577476038338658, + "grad_norm": 0.17354144155979156, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6983 + }, + { + "epoch": 5.578274760383387, + "grad_norm": 0.10959888994693756, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6984 + }, + { + "epoch": 5.5790734824281145, + "grad_norm": 0.22630754113197327, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6985 + }, + { + "epoch": 5.579872204472844, + "grad_norm": 0.3786774277687073, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6986 + }, + { + "epoch": 5.580670926517572, + "grad_norm": 0.13818539679050446, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 6987 + }, + { + "epoch": 5.5814696485623, + "grad_norm": 0.22202269732952118, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6988 + }, + { + "epoch": 5.582268370607029, + "grad_norm": 0.08324426412582397, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6989 + }, + { + "epoch": 5.583067092651757, + "grad_norm": 0.16399513185024261, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6990 + }, + { + "epoch": 5.583865814696486, + "grad_norm": 0.13956478238105774, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6991 + }, + { + "epoch": 5.584664536741214, + "grad_norm": 0.09159751981496811, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6992 + }, + { + "epoch": 5.585463258785943, + "grad_norm": 0.19404387474060059, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6993 + }, + { + "epoch": 5.586261980830671, + "grad_norm": 0.07866083085536957, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6994 + }, + { + "epoch": 5.5870607028754, + "grad_norm": 0.10653684288263321, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6995 + }, + { + "epoch": 5.587859424920127, + "grad_norm": 0.12254250794649124, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6996 + }, + { + "epoch": 5.588658146964856, + "grad_norm": 0.0665711760520935, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6997 + }, + { + "epoch": 5.5894568690095845, + "grad_norm": 0.1234782338142395, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6998 + }, + { + "epoch": 5.590255591054313, + "grad_norm": 0.10345113277435303, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6999 + }, + { + "epoch": 5.5910543130990416, + "grad_norm": 0.10187766700983047, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7000 + }, + { + "epoch": 5.59185303514377, + "grad_norm": 0.10330864042043686, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7001 + }, + { + "epoch": 5.592651757188499, + "grad_norm": 0.12427254766225815, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7002 + }, + { + "epoch": 5.593450479233227, + "grad_norm": 0.06854265183210373, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7003 + }, + { + "epoch": 5.594249201277956, + "grad_norm": 0.07029487192630768, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7004 + }, + { + "epoch": 5.595047923322683, + "grad_norm": 0.07483061403036118, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7005 + }, + { + "epoch": 5.595846645367412, + "grad_norm": 0.08542168885469437, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7006 + }, + { + "epoch": 5.59664536741214, + "grad_norm": 0.05537399277091026, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7007 + }, + { + "epoch": 5.597444089456869, + "grad_norm": 0.28531956672668457, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7008 + }, + { + "epoch": 5.598242811501597, + "grad_norm": 0.1349600851535797, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7009 + }, + { + "epoch": 5.599041533546326, + "grad_norm": 0.06000711768865585, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7010 + }, + { + "epoch": 5.5998402555910545, + "grad_norm": 0.08139210939407349, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 7011 + }, + { + "epoch": 5.600638977635783, + "grad_norm": 0.08603602647781372, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7012 + }, + { + "epoch": 5.6014376996805115, + "grad_norm": 0.06586270034313202, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7013 + }, + { + "epoch": 5.602236421725239, + "grad_norm": 0.06276310235261917, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7014 + }, + { + "epoch": 5.603035143769968, + "grad_norm": 0.06072620674967766, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7015 + }, + { + "epoch": 5.603833865814696, + "grad_norm": 0.07509211450815201, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7016 + }, + { + "epoch": 5.604632587859425, + "grad_norm": 0.07241938263177872, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7017 + }, + { + "epoch": 5.605431309904153, + "grad_norm": 0.05110672488808632, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7018 + }, + { + "epoch": 5.606230031948882, + "grad_norm": 0.043005820363759995, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7019 + }, + { + "epoch": 5.60702875399361, + "grad_norm": 0.06298743188381195, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7020 + }, + { + "epoch": 5.607827476038339, + "grad_norm": 0.09457913786172867, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7021 + }, + { + "epoch": 5.608626198083067, + "grad_norm": 0.08066218346357346, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7022 + }, + { + "epoch": 5.609424920127795, + "grad_norm": 0.0845603421330452, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7023 + }, + { + "epoch": 5.6102236421725244, + "grad_norm": 0.09121926873922348, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7024 + }, + { + "epoch": 5.611022364217252, + "grad_norm": 0.12013491243124008, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7025 + }, + { + "epoch": 5.611821086261981, + "grad_norm": 0.062171660363674164, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7026 + }, + { + "epoch": 5.612619808306709, + "grad_norm": 0.05688954144716263, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7027 + }, + { + "epoch": 5.613418530351438, + "grad_norm": 0.049224793910980225, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7028 + }, + { + "epoch": 5.614217252396166, + "grad_norm": 0.06337599456310272, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7029 + }, + { + "epoch": 5.615015974440895, + "grad_norm": 0.03602084144949913, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7030 + }, + { + "epoch": 5.615814696485623, + "grad_norm": 0.06257645785808563, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7031 + }, + { + "epoch": 5.616613418530352, + "grad_norm": 0.09524381905794144, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7032 + }, + { + "epoch": 5.61741214057508, + "grad_norm": 0.06262468546628952, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7033 + }, + { + "epoch": 5.618210862619808, + "grad_norm": 0.23001722991466522, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7034 + }, + { + "epoch": 5.6190095846645365, + "grad_norm": 0.06312809139490128, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7035 + }, + { + "epoch": 5.619808306709265, + "grad_norm": 0.055973440408706665, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7036 + }, + { + "epoch": 5.6206070287539935, + "grad_norm": 0.0943455770611763, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7037 + }, + { + "epoch": 5.621405750798722, + "grad_norm": 0.05577901378273964, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7038 + }, + { + "epoch": 5.622204472843451, + "grad_norm": 0.057599395513534546, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7039 + }, + { + "epoch": 5.623003194888179, + "grad_norm": 0.07785748690366745, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7040 + }, + { + "epoch": 5.623801916932908, + "grad_norm": 0.04796557500958443, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7041 + }, + { + "epoch": 5.624600638977636, + "grad_norm": 0.19438667595386505, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7042 + }, + { + "epoch": 5.625399361022364, + "grad_norm": 0.10055433958768845, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7043 + }, + { + "epoch": 5.626198083067092, + "grad_norm": 0.06082126125693321, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7044 + }, + { + "epoch": 5.626996805111821, + "grad_norm": 0.07862866669893265, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7045 + }, + { + "epoch": 5.627795527156549, + "grad_norm": 0.09042234718799591, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7046 + }, + { + "epoch": 5.628594249201278, + "grad_norm": 0.06087128072977066, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7047 + }, + { + "epoch": 5.6293929712460065, + "grad_norm": 0.04091280326247215, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7048 + }, + { + "epoch": 5.630191693290735, + "grad_norm": 0.0625537633895874, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7049 + }, + { + "epoch": 5.6309904153354635, + "grad_norm": 0.04506808891892433, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7050 + }, + { + "epoch": 5.631789137380192, + "grad_norm": 0.0750357061624527, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7051 + }, + { + "epoch": 5.63258785942492, + "grad_norm": 0.06990372389554977, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7052 + }, + { + "epoch": 5.633386581469648, + "grad_norm": 0.05008876323699951, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7053 + }, + { + "epoch": 5.634185303514377, + "grad_norm": 0.07472547143697739, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7054 + }, + { + "epoch": 5.634984025559105, + "grad_norm": 0.04004117101430893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7055 + }, + { + "epoch": 5.635782747603834, + "grad_norm": 0.10103464871644974, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7056 + }, + { + "epoch": 5.636581469648562, + "grad_norm": 0.10850277543067932, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7057 + }, + { + "epoch": 5.637380191693291, + "grad_norm": 0.1109318807721138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7058 + }, + { + "epoch": 5.638178913738019, + "grad_norm": 0.06371457874774933, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7059 + }, + { + "epoch": 5.638977635782748, + "grad_norm": 0.1320749819278717, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7060 + }, + { + "epoch": 5.6397763578274756, + "grad_norm": 0.11957977712154388, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7061 + }, + { + "epoch": 5.640575079872205, + "grad_norm": 0.10327479988336563, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7062 + }, + { + "epoch": 5.641373801916933, + "grad_norm": 0.09731981158256531, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7063 + }, + { + "epoch": 5.642172523961661, + "grad_norm": 0.10276936739683151, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7064 + }, + { + "epoch": 5.64297124600639, + "grad_norm": 0.06973864883184433, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7065 + }, + { + "epoch": 5.643769968051118, + "grad_norm": 0.12020955234766006, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7066 + }, + { + "epoch": 5.644568690095847, + "grad_norm": 0.15950947999954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7067 + }, + { + "epoch": 5.645367412140575, + "grad_norm": 0.08034086227416992, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7068 + }, + { + "epoch": 5.646166134185304, + "grad_norm": 0.11269761621952057, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7069 + }, + { + "epoch": 5.646964856230032, + "grad_norm": 0.1569385826587677, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7070 + }, + { + "epoch": 5.647763578274761, + "grad_norm": 0.09290867298841476, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7071 + }, + { + "epoch": 5.6485623003194885, + "grad_norm": 0.0742817223072052, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7072 + }, + { + "epoch": 5.649361022364217, + "grad_norm": 0.3531377911567688, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7073 + }, + { + "epoch": 5.6501597444089455, + "grad_norm": 0.05365251749753952, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7074 + }, + { + "epoch": 5.650958466453674, + "grad_norm": 0.10185245424509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7075 + }, + { + "epoch": 5.651757188498403, + "grad_norm": 0.08978144079446793, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7076 + }, + { + "epoch": 5.652555910543131, + "grad_norm": 0.06563816964626312, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7077 + }, + { + "epoch": 5.65335463258786, + "grad_norm": 0.11167218536138535, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7078 + }, + { + "epoch": 5.654153354632588, + "grad_norm": 0.10078081488609314, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7079 + }, + { + "epoch": 5.654952076677317, + "grad_norm": 0.04581546410918236, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7080 + }, + { + "epoch": 5.655750798722044, + "grad_norm": 0.04128880053758621, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7081 + }, + { + "epoch": 5.656549520766773, + "grad_norm": 0.0887683555483818, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7082 + }, + { + "epoch": 5.657348242811501, + "grad_norm": 0.06673122197389603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7083 + }, + { + "epoch": 5.65814696485623, + "grad_norm": 0.12348195165395737, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7084 + }, + { + "epoch": 5.6589456869009584, + "grad_norm": 0.04828948527574539, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7085 + }, + { + "epoch": 5.659744408945687, + "grad_norm": 0.09094297885894775, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7086 + }, + { + "epoch": 5.6605431309904155, + "grad_norm": 0.05775933712720871, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7087 + }, + { + "epoch": 5.661341853035144, + "grad_norm": 0.06460239738225937, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7088 + }, + { + "epoch": 5.662140575079873, + "grad_norm": 0.07246532291173935, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7089 + }, + { + "epoch": 5.6629392971246, + "grad_norm": 0.05635413900017738, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7090 + }, + { + "epoch": 5.663738019169329, + "grad_norm": 0.05866781249642372, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7091 + }, + { + "epoch": 5.664536741214057, + "grad_norm": 0.11024738848209381, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7092 + }, + { + "epoch": 5.665335463258786, + "grad_norm": 2.880472421646118, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7093 + }, + { + "epoch": 5.666134185303514, + "grad_norm": 0.147624671459198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7094 + }, + { + "epoch": 5.666932907348243, + "grad_norm": 0.16042540967464447, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7095 + }, + { + "epoch": 5.667731629392971, + "grad_norm": 0.044081881642341614, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7096 + }, + { + "epoch": 5.6685303514377, + "grad_norm": 0.1580066829919815, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7097 + }, + { + "epoch": 5.669329073482428, + "grad_norm": 0.1348607987165451, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7098 + }, + { + "epoch": 5.670127795527156, + "grad_norm": 0.06525023281574249, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7099 + }, + { + "epoch": 5.6709265175718855, + "grad_norm": 0.12954704463481903, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7100 + }, + { + "epoch": 5.671725239616613, + "grad_norm": 0.09241525083780289, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7101 + }, + { + "epoch": 5.672523961661342, + "grad_norm": 0.05581163614988327, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7102 + }, + { + "epoch": 5.67332268370607, + "grad_norm": 0.0864885225892067, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7103 + }, + { + "epoch": 5.674121405750799, + "grad_norm": 0.0783633440732956, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7104 + }, + { + "epoch": 5.674920127795527, + "grad_norm": 2.419416666030884, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7105 + }, + { + "epoch": 5.675718849840256, + "grad_norm": 0.30067741870880127, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7106 + }, + { + "epoch": 5.676517571884984, + "grad_norm": 0.2876960337162018, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 7107 + }, + { + "epoch": 5.677316293929713, + "grad_norm": 0.13828304409980774, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7108 + }, + { + "epoch": 5.678115015974441, + "grad_norm": 0.12691721320152283, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7109 + }, + { + "epoch": 5.678913738019169, + "grad_norm": 0.18356311321258545, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 7110 + }, + { + "epoch": 5.6797124600638975, + "grad_norm": 0.13121426105499268, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7111 + }, + { + "epoch": 5.680511182108626, + "grad_norm": 0.13354304432868958, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7112 + }, + { + "epoch": 5.681309904153355, + "grad_norm": 0.10858450084924698, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7113 + }, + { + "epoch": 5.682108626198083, + "grad_norm": 0.12026678770780563, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 7114 + }, + { + "epoch": 5.682907348242812, + "grad_norm": 0.10297723114490509, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7115 + }, + { + "epoch": 5.68370607028754, + "grad_norm": 0.10481604188680649, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7116 + }, + { + "epoch": 5.684504792332269, + "grad_norm": 0.1389889419078827, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7117 + }, + { + "epoch": 5.685303514376997, + "grad_norm": 0.047913264483213425, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7118 + }, + { + "epoch": 5.686102236421725, + "grad_norm": 0.07504977285861969, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7119 + }, + { + "epoch": 5.686900958466453, + "grad_norm": 0.08858702331781387, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7120 + }, + { + "epoch": 5.687699680511182, + "grad_norm": 0.07746905088424683, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7121 + }, + { + "epoch": 5.68849840255591, + "grad_norm": 0.20370569825172424, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7122 + }, + { + "epoch": 5.689297124600639, + "grad_norm": 0.053284503519535065, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7123 + }, + { + "epoch": 5.6900958466453675, + "grad_norm": 0.08579347282648087, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7124 + }, + { + "epoch": 5.690894568690096, + "grad_norm": 0.11220933496952057, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7125 + }, + { + "epoch": 5.6916932907348246, + "grad_norm": 0.11851351708173752, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 7126 + }, + { + "epoch": 5.692492012779553, + "grad_norm": 0.0839112401008606, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7127 + }, + { + "epoch": 5.693290734824281, + "grad_norm": 0.07717803865671158, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7128 + }, + { + "epoch": 5.694089456869009, + "grad_norm": 0.10219333320856094, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7129 + }, + { + "epoch": 5.694888178913738, + "grad_norm": 0.06746016442775726, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7130 + }, + { + "epoch": 5.695686900958466, + "grad_norm": 0.09630785137414932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7131 + }, + { + "epoch": 5.696485623003195, + "grad_norm": 0.059845466166734695, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7132 + }, + { + "epoch": 5.697284345047923, + "grad_norm": 0.10587267577648163, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7133 + }, + { + "epoch": 5.698083067092652, + "grad_norm": 0.12221334874629974, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7134 + }, + { + "epoch": 5.69888178913738, + "grad_norm": 0.1638030856847763, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7135 + }, + { + "epoch": 5.699680511182109, + "grad_norm": 0.04686988145112991, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7136 + }, + { + "epoch": 5.700479233226837, + "grad_norm": 0.09120972454547882, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7137 + }, + { + "epoch": 5.701277955271565, + "grad_norm": 0.1081257089972496, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7138 + }, + { + "epoch": 5.702076677316294, + "grad_norm": 0.07313218712806702, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7139 + }, + { + "epoch": 5.702875399361022, + "grad_norm": 0.06039511039853096, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7140 + }, + { + "epoch": 5.703674121405751, + "grad_norm": 0.14473693072795868, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7141 + }, + { + "epoch": 5.704472843450479, + "grad_norm": 0.15062592923641205, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7142 + }, + { + "epoch": 5.705271565495208, + "grad_norm": 0.09711029380559921, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7143 + }, + { + "epoch": 5.706070287539936, + "grad_norm": 0.056874651461839676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7144 + }, + { + "epoch": 5.706869009584665, + "grad_norm": 0.1077205091714859, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7145 + }, + { + "epoch": 5.707667731629393, + "grad_norm": 0.1437366008758545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7146 + }, + { + "epoch": 5.708466453674122, + "grad_norm": 0.06206873059272766, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7147 + }, + { + "epoch": 5.7092651757188495, + "grad_norm": 0.06379563361406326, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7148 + }, + { + "epoch": 5.710063897763578, + "grad_norm": 0.11586727946996689, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7149 + }, + { + "epoch": 5.710862619808307, + "grad_norm": 0.12792269885540009, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7150 + }, + { + "epoch": 5.711661341853035, + "grad_norm": 0.08514344692230225, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7151 + }, + { + "epoch": 5.712460063897764, + "grad_norm": 0.045359376817941666, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7152 + }, + { + "epoch": 5.713258785942492, + "grad_norm": 0.13782942295074463, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7153 + }, + { + "epoch": 5.714057507987221, + "grad_norm": 0.1362733691930771, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7154 + }, + { + "epoch": 5.714856230031949, + "grad_norm": 0.11249929666519165, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7155 + }, + { + "epoch": 5.715654952076678, + "grad_norm": 0.07308060675859451, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7156 + }, + { + "epoch": 5.716453674121405, + "grad_norm": 0.08434231579303741, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7157 + }, + { + "epoch": 5.717252396166134, + "grad_norm": 0.0800870731472969, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7158 + }, + { + "epoch": 5.718051118210862, + "grad_norm": 0.09833595156669617, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7159 + }, + { + "epoch": 5.718849840255591, + "grad_norm": 0.06979871541261673, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7160 + }, + { + "epoch": 5.7196485623003195, + "grad_norm": 0.3326590657234192, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7161 + }, + { + "epoch": 5.720447284345048, + "grad_norm": 0.07953538745641708, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7162 + }, + { + "epoch": 5.7212460063897765, + "grad_norm": 0.06084589287638664, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7163 + }, + { + "epoch": 5.722044728434505, + "grad_norm": 0.05060078203678131, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7164 + }, + { + "epoch": 5.722843450479234, + "grad_norm": 0.11765584349632263, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7165 + }, + { + "epoch": 5.723642172523961, + "grad_norm": 0.11147762089967728, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7166 + }, + { + "epoch": 5.72444089456869, + "grad_norm": 0.051353756338357925, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7167 + }, + { + "epoch": 5.725239616613418, + "grad_norm": 0.06255709379911423, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7168 + }, + { + "epoch": 5.726038338658147, + "grad_norm": 0.048915427178144455, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7169 + }, + { + "epoch": 5.726837060702875, + "grad_norm": 0.057233601808547974, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7170 + }, + { + "epoch": 5.727635782747604, + "grad_norm": 0.0828251764178276, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7171 + }, + { + "epoch": 5.728434504792332, + "grad_norm": 0.07387874275445938, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7172 + }, + { + "epoch": 5.729233226837061, + "grad_norm": 0.04857983812689781, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7173 + }, + { + "epoch": 5.7300319488817895, + "grad_norm": 0.07202452421188354, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7174 + }, + { + "epoch": 5.730830670926517, + "grad_norm": 0.4291386306285858, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7175 + }, + { + "epoch": 5.731629392971246, + "grad_norm": 0.07219598442316055, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7176 + }, + { + "epoch": 5.732428115015974, + "grad_norm": 0.07889580726623535, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7177 + }, + { + "epoch": 5.733226837060703, + "grad_norm": 0.1154242753982544, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7178 + }, + { + "epoch": 5.734025559105431, + "grad_norm": 0.1711360067129135, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7179 + }, + { + "epoch": 5.73482428115016, + "grad_norm": 0.15897679328918457, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7180 + }, + { + "epoch": 5.735623003194888, + "grad_norm": 0.056718453764915466, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7181 + }, + { + "epoch": 5.736421725239617, + "grad_norm": 0.10130516439676285, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7182 + }, + { + "epoch": 5.737220447284345, + "grad_norm": 0.10965991020202637, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7183 + }, + { + "epoch": 5.738019169329074, + "grad_norm": 0.043925706297159195, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7184 + }, + { + "epoch": 5.738817891373802, + "grad_norm": 0.16040641069412231, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7185 + }, + { + "epoch": 5.73961661341853, + "grad_norm": 0.545796275138855, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7186 + }, + { + "epoch": 5.7404153354632586, + "grad_norm": 0.12285015732049942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7187 + }, + { + "epoch": 5.741214057507987, + "grad_norm": 0.1241980791091919, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7188 + }, + { + "epoch": 5.742012779552716, + "grad_norm": 0.18415005505084991, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7189 + }, + { + "epoch": 5.742811501597444, + "grad_norm": 0.1455639749765396, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7190 + }, + { + "epoch": 5.743610223642173, + "grad_norm": 0.05731341987848282, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7191 + }, + { + "epoch": 5.744408945686901, + "grad_norm": 0.10810694098472595, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7192 + }, + { + "epoch": 5.74520766773163, + "grad_norm": 0.13279423117637634, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7193 + }, + { + "epoch": 5.746006389776358, + "grad_norm": 0.048075832426548004, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7194 + }, + { + "epoch": 5.746805111821086, + "grad_norm": 0.07276510447263718, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7195 + }, + { + "epoch": 5.747603833865814, + "grad_norm": 0.0666821077466011, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7196 + }, + { + "epoch": 5.748402555910543, + "grad_norm": 0.0950300320982933, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7197 + }, + { + "epoch": 5.7492012779552715, + "grad_norm": 0.07229208946228027, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7198 + }, + { + "epoch": 5.75, + "grad_norm": 0.08129260689020157, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7199 + }, + { + "epoch": 5.7507987220447285, + "grad_norm": 0.08685708791017532, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7200 + }, + { + "epoch": 5.751597444089457, + "grad_norm": 0.048116523772478104, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7201 + }, + { + "epoch": 5.752396166134186, + "grad_norm": 0.08470416814088821, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7202 + }, + { + "epoch": 5.753194888178914, + "grad_norm": 0.09388689696788788, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7203 + }, + { + "epoch": 5.753993610223642, + "grad_norm": 0.07961093634366989, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7204 + }, + { + "epoch": 5.75479233226837, + "grad_norm": 0.05949364975094795, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7205 + }, + { + "epoch": 5.755591054313099, + "grad_norm": 0.10149726271629333, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7206 + }, + { + "epoch": 5.756389776357827, + "grad_norm": 0.30414992570877075, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7207 + }, + { + "epoch": 5.757188498402556, + "grad_norm": 0.06670042872428894, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7208 + }, + { + "epoch": 5.757987220447284, + "grad_norm": 0.061501920223236084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7209 + }, + { + "epoch": 5.758785942492013, + "grad_norm": 0.06627584993839264, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7210 + }, + { + "epoch": 5.7595846645367414, + "grad_norm": 0.1268157660961151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7211 + }, + { + "epoch": 5.76038338658147, + "grad_norm": 0.10253716260194778, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7212 + }, + { + "epoch": 5.761182108626198, + "grad_norm": 0.08384321630001068, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7213 + }, + { + "epoch": 5.761980830670926, + "grad_norm": 0.09078267216682434, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7214 + }, + { + "epoch": 5.762779552715655, + "grad_norm": 0.10487394034862518, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7215 + }, + { + "epoch": 5.763578274760383, + "grad_norm": 0.12192805856466293, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7216 + }, + { + "epoch": 5.764376996805112, + "grad_norm": 0.16597039997577667, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7217 + }, + { + "epoch": 5.76517571884984, + "grad_norm": 0.08498643338680267, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7218 + }, + { + "epoch": 5.765974440894569, + "grad_norm": 0.12794862687587738, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7219 + }, + { + "epoch": 5.766773162939297, + "grad_norm": 0.13595858216285706, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7220 + }, + { + "epoch": 5.767571884984026, + "grad_norm": 0.08182058483362198, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7221 + }, + { + "epoch": 5.768370607028754, + "grad_norm": 0.11747279763221741, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7222 + }, + { + "epoch": 5.769169329073483, + "grad_norm": 0.13400238752365112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7223 + }, + { + "epoch": 5.7699680511182105, + "grad_norm": 0.18527893722057343, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7224 + }, + { + "epoch": 5.770766773162939, + "grad_norm": 0.05130131170153618, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7225 + }, + { + "epoch": 5.771565495207668, + "grad_norm": 0.14139772951602936, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7226 + }, + { + "epoch": 5.772364217252396, + "grad_norm": 0.07901434600353241, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7227 + }, + { + "epoch": 5.773162939297125, + "grad_norm": 0.0642717182636261, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7228 + }, + { + "epoch": 5.773961661341853, + "grad_norm": 0.0693419873714447, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7229 + }, + { + "epoch": 5.774760383386582, + "grad_norm": 0.06490292400121689, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7230 + }, + { + "epoch": 5.77555910543131, + "grad_norm": 0.09405414760112762, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7231 + }, + { + "epoch": 5.776357827476039, + "grad_norm": 0.10439605265855789, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7232 + }, + { + "epoch": 5.777156549520766, + "grad_norm": 0.06811316311359406, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7233 + }, + { + "epoch": 5.777955271565495, + "grad_norm": 0.0707770362496376, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7234 + }, + { + "epoch": 5.7787539936102235, + "grad_norm": 0.08751409500837326, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7235 + }, + { + "epoch": 5.779552715654952, + "grad_norm": 0.09626015275716782, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7236 + }, + { + "epoch": 5.7803514376996805, + "grad_norm": 0.11487453430891037, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7237 + }, + { + "epoch": 5.781150159744409, + "grad_norm": 0.06278856843709946, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7238 + }, + { + "epoch": 5.781948881789138, + "grad_norm": 0.131802499294281, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7239 + }, + { + "epoch": 5.782747603833866, + "grad_norm": 0.09209976345300674, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7240 + }, + { + "epoch": 5.783546325878595, + "grad_norm": 0.06524617224931717, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7241 + }, + { + "epoch": 5.784345047923322, + "grad_norm": 0.10735169053077698, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7242 + }, + { + "epoch": 5.785143769968051, + "grad_norm": 0.08926022797822952, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7243 + }, + { + "epoch": 5.785942492012779, + "grad_norm": 0.08254969120025635, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7244 + }, + { + "epoch": 5.786741214057508, + "grad_norm": 0.07478158175945282, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7245 + }, + { + "epoch": 5.787539936102236, + "grad_norm": 0.0974164679646492, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7246 + }, + { + "epoch": 5.788338658146965, + "grad_norm": 0.05145352706313133, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7247 + }, + { + "epoch": 5.789137380191693, + "grad_norm": 0.11986715346574783, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7248 + }, + { + "epoch": 5.789936102236422, + "grad_norm": 0.12020506709814072, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7249 + }, + { + "epoch": 5.7907348242811505, + "grad_norm": 0.07199704647064209, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7250 + }, + { + "epoch": 5.791533546325878, + "grad_norm": 0.10702182352542877, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7251 + }, + { + "epoch": 5.792332268370607, + "grad_norm": 0.10817115753889084, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7252 + }, + { + "epoch": 5.793130990415335, + "grad_norm": 0.1875494122505188, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7253 + }, + { + "epoch": 5.793929712460064, + "grad_norm": 0.07347052544355392, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7254 + }, + { + "epoch": 5.794728434504792, + "grad_norm": 0.08588847517967224, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7255 + }, + { + "epoch": 5.795527156549521, + "grad_norm": 0.08241020143032074, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7256 + }, + { + "epoch": 5.796325878594249, + "grad_norm": 0.06322775781154633, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7257 + }, + { + "epoch": 5.797124600638978, + "grad_norm": 0.10279159247875214, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7258 + }, + { + "epoch": 5.797923322683706, + "grad_norm": 0.1887427717447281, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7259 + }, + { + "epoch": 5.798722044728435, + "grad_norm": 0.12288179248571396, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7260 + }, + { + "epoch": 5.799520766773163, + "grad_norm": 0.07014663517475128, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7261 + }, + { + "epoch": 5.800319488817891, + "grad_norm": 0.3741980493068695, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7262 + }, + { + "epoch": 5.80111821086262, + "grad_norm": 0.10083315521478653, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7263 + }, + { + "epoch": 5.801916932907348, + "grad_norm": 0.06427261233329773, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7264 + }, + { + "epoch": 5.802715654952077, + "grad_norm": 0.06265366077423096, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7265 + }, + { + "epoch": 5.803514376996805, + "grad_norm": 0.09602728486061096, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7266 + }, + { + "epoch": 5.804313099041534, + "grad_norm": 0.10369620472192764, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7267 + }, + { + "epoch": 5.805111821086262, + "grad_norm": 0.09742012619972229, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7268 + }, + { + "epoch": 5.805910543130991, + "grad_norm": 0.11579136550426483, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7269 + }, + { + "epoch": 5.806709265175719, + "grad_norm": 0.11265771090984344, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7270 + }, + { + "epoch": 5.807507987220447, + "grad_norm": 0.10684274882078171, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7271 + }, + { + "epoch": 5.8083067092651754, + "grad_norm": 0.12550850212574005, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7272 + }, + { + "epoch": 5.809105431309904, + "grad_norm": 0.04966668784618378, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7273 + }, + { + "epoch": 5.8099041533546325, + "grad_norm": 0.26124852895736694, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7274 + }, + { + "epoch": 5.810702875399361, + "grad_norm": 0.12293774634599686, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7275 + }, + { + "epoch": 5.81150159744409, + "grad_norm": 0.11183387041091919, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7276 + }, + { + "epoch": 5.812300319488818, + "grad_norm": 0.08738099783658981, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7277 + }, + { + "epoch": 5.813099041533547, + "grad_norm": 0.06429604440927505, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7278 + }, + { + "epoch": 5.813897763578275, + "grad_norm": 0.09102299064397812, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7279 + }, + { + "epoch": 5.814696485623003, + "grad_norm": 0.06249788776040077, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7280 + }, + { + "epoch": 5.815495207667731, + "grad_norm": 0.08752568066120148, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7281 + }, + { + "epoch": 5.81629392971246, + "grad_norm": 0.06289692968130112, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7282 + }, + { + "epoch": 5.817092651757188, + "grad_norm": 0.1269187480211258, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7283 + }, + { + "epoch": 5.817891373801917, + "grad_norm": 0.0839361846446991, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7284 + }, + { + "epoch": 5.818690095846645, + "grad_norm": 0.0855027437210083, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7285 + }, + { + "epoch": 5.819488817891374, + "grad_norm": 0.20559446513652802, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7286 + }, + { + "epoch": 5.8202875399361025, + "grad_norm": 0.0740990862250328, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7287 + }, + { + "epoch": 5.821086261980831, + "grad_norm": 0.06762924790382385, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7288 + }, + { + "epoch": 5.821884984025559, + "grad_norm": 0.5238296985626221, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7289 + }, + { + "epoch": 5.822683706070287, + "grad_norm": 0.09929470717906952, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7290 + }, + { + "epoch": 5.823482428115016, + "grad_norm": 0.11528550088405609, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7291 + }, + { + "epoch": 5.824281150159744, + "grad_norm": 0.10563576966524124, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 7292 + }, + { + "epoch": 5.825079872204473, + "grad_norm": 0.13924843072891235, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7293 + }, + { + "epoch": 5.825878594249201, + "grad_norm": 0.1332271546125412, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7294 + }, + { + "epoch": 5.82667731629393, + "grad_norm": 0.15709803998470306, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7295 + }, + { + "epoch": 5.827476038338658, + "grad_norm": 0.19638708233833313, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7296 + }, + { + "epoch": 5.828274760383387, + "grad_norm": 0.16845624148845673, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7297 + }, + { + "epoch": 5.8290734824281145, + "grad_norm": 0.15753695368766785, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7298 + }, + { + "epoch": 5.829872204472844, + "grad_norm": 0.04734346270561218, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7299 + }, + { + "epoch": 5.830670926517572, + "grad_norm": 0.48153460025787354, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7300 + }, + { + "epoch": 5.8314696485623, + "grad_norm": 0.09118880331516266, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7301 + }, + { + "epoch": 5.832268370607029, + "grad_norm": 0.10301438719034195, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7302 + }, + { + "epoch": 5.833067092651757, + "grad_norm": 0.12838974595069885, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7303 + }, + { + "epoch": 5.833865814696486, + "grad_norm": 0.1537700593471527, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7304 + }, + { + "epoch": 5.834664536741214, + "grad_norm": 0.08763979375362396, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7305 + }, + { + "epoch": 5.835463258785943, + "grad_norm": 0.2613058388233185, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7306 + }, + { + "epoch": 5.836261980830671, + "grad_norm": 0.13767825067043304, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7307 + }, + { + "epoch": 5.8370607028754, + "grad_norm": 0.14907905459403992, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7308 + }, + { + "epoch": 5.837859424920127, + "grad_norm": 0.3314233124256134, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 7309 + }, + { + "epoch": 5.838658146964856, + "grad_norm": 0.1368636041879654, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7310 + }, + { + "epoch": 5.8394568690095845, + "grad_norm": 0.13423767685890198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7311 + }, + { + "epoch": 5.840255591054313, + "grad_norm": 0.08914478868246078, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7312 + }, + { + "epoch": 5.8410543130990416, + "grad_norm": 0.09363356977701187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7313 + }, + { + "epoch": 5.84185303514377, + "grad_norm": 0.226780965924263, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7314 + }, + { + "epoch": 5.842651757188499, + "grad_norm": 0.09002092480659485, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7315 + }, + { + "epoch": 5.843450479233227, + "grad_norm": 0.06387127935886383, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7316 + }, + { + "epoch": 5.844249201277956, + "grad_norm": 0.1643945276737213, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7317 + }, + { + "epoch": 5.845047923322683, + "grad_norm": 0.13561291992664337, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7318 + }, + { + "epoch": 5.845846645367412, + "grad_norm": 0.14334949851036072, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7319 + }, + { + "epoch": 5.84664536741214, + "grad_norm": 0.13982698321342468, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7320 + }, + { + "epoch": 5.847444089456869, + "grad_norm": 0.10822772979736328, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7321 + }, + { + "epoch": 5.848242811501597, + "grad_norm": 0.07073087245225906, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7322 + }, + { + "epoch": 5.849041533546326, + "grad_norm": 0.09560684859752655, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7323 + }, + { + "epoch": 5.8498402555910545, + "grad_norm": 0.0882779061794281, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7324 + }, + { + "epoch": 5.850638977635783, + "grad_norm": 0.17319771647453308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7325 + }, + { + "epoch": 5.8514376996805115, + "grad_norm": 0.12140306830406189, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7326 + }, + { + "epoch": 5.852236421725239, + "grad_norm": 0.12064560502767563, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7327 + }, + { + "epoch": 5.853035143769968, + "grad_norm": 0.0733642578125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7328 + }, + { + "epoch": 5.853833865814696, + "grad_norm": 0.08563291281461716, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7329 + }, + { + "epoch": 5.854632587859425, + "grad_norm": 0.11337493360042572, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7330 + }, + { + "epoch": 5.855431309904153, + "grad_norm": 0.12164553254842758, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7331 + }, + { + "epoch": 5.856230031948882, + "grad_norm": 0.06406484544277191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7332 + }, + { + "epoch": 5.85702875399361, + "grad_norm": 0.0765780508518219, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7333 + }, + { + "epoch": 5.857827476038339, + "grad_norm": 0.12847815454006195, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7334 + }, + { + "epoch": 5.858626198083067, + "grad_norm": 0.11934550106525421, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7335 + }, + { + "epoch": 5.859424920127795, + "grad_norm": 0.08170188963413239, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7336 + }, + { + "epoch": 5.8602236421725244, + "grad_norm": 0.13636507093906403, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7337 + }, + { + "epoch": 5.861022364217252, + "grad_norm": 0.11030741780996323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7338 + }, + { + "epoch": 5.861821086261981, + "grad_norm": 0.10200777649879456, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7339 + }, + { + "epoch": 5.862619808306709, + "grad_norm": 0.09916897118091583, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7340 + }, + { + "epoch": 5.863418530351438, + "grad_norm": 0.08136509358882904, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7341 + }, + { + "epoch": 5.864217252396166, + "grad_norm": 0.051609545946121216, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7342 + }, + { + "epoch": 5.865015974440895, + "grad_norm": 0.061890844255685806, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7343 + }, + { + "epoch": 5.865814696485623, + "grad_norm": 0.10308966040611267, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7344 + }, + { + "epoch": 5.866613418530352, + "grad_norm": 0.06762709468603134, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7345 + }, + { + "epoch": 5.86741214057508, + "grad_norm": 0.07767036557197571, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7346 + }, + { + "epoch": 5.868210862619808, + "grad_norm": 0.10608458518981934, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7347 + }, + { + "epoch": 5.8690095846645365, + "grad_norm": 0.13812315464019775, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7348 + }, + { + "epoch": 5.869808306709265, + "grad_norm": 0.10485442727804184, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7349 + }, + { + "epoch": 5.8706070287539935, + "grad_norm": 0.08510198444128036, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7350 + }, + { + "epoch": 5.871405750798722, + "grad_norm": 0.17235122621059418, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7351 + }, + { + "epoch": 5.872204472843451, + "grad_norm": 0.057075515389442444, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7352 + }, + { + "epoch": 5.873003194888179, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7353 + }, + { + "epoch": 5.873801916932908, + "grad_norm": 0.1859748661518097, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7354 + }, + { + "epoch": 5.874600638977636, + "grad_norm": 0.2350156307220459, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7355 + }, + { + "epoch": 5.875399361022364, + "grad_norm": 0.11264859884977341, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 7356 + }, + { + "epoch": 5.876198083067092, + "grad_norm": 0.2859210968017578, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7357 + }, + { + "epoch": 5.876996805111821, + "grad_norm": 0.08706829696893692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7358 + }, + { + "epoch": 5.877795527156549, + "grad_norm": 0.0644318088889122, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7359 + }, + { + "epoch": 5.878594249201278, + "grad_norm": 0.10985474288463593, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7360 + }, + { + "epoch": 5.8793929712460065, + "grad_norm": 0.09968867897987366, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7361 + }, + { + "epoch": 5.880191693290735, + "grad_norm": 0.07277355343103409, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7362 + }, + { + "epoch": 5.8809904153354635, + "grad_norm": 0.043085962533950806, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7363 + }, + { + "epoch": 5.881789137380192, + "grad_norm": 0.10392415523529053, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7364 + }, + { + "epoch": 5.88258785942492, + "grad_norm": 0.05523041635751724, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7365 + }, + { + "epoch": 5.883386581469648, + "grad_norm": 0.1754276603460312, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7366 + }, + { + "epoch": 5.884185303514377, + "grad_norm": 0.09561391174793243, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7367 + }, + { + "epoch": 5.884984025559105, + "grad_norm": 0.17572976648807526, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7368 + }, + { + "epoch": 5.885782747603834, + "grad_norm": 0.06476190686225891, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7369 + }, + { + "epoch": 5.886581469648562, + "grad_norm": 0.08763223886489868, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7370 + }, + { + "epoch": 5.887380191693291, + "grad_norm": 0.04419226944446564, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7371 + }, + { + "epoch": 5.888178913738019, + "grad_norm": 0.08707522600889206, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7372 + }, + { + "epoch": 5.888977635782748, + "grad_norm": 0.3117498457431793, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7373 + }, + { + "epoch": 5.8897763578274756, + "grad_norm": 0.04153338074684143, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7374 + }, + { + "epoch": 5.890575079872205, + "grad_norm": 0.10575849562883377, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7375 + }, + { + "epoch": 5.891373801916933, + "grad_norm": 0.07147886604070663, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7376 + }, + { + "epoch": 5.892172523961661, + "grad_norm": 0.05394810438156128, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7377 + }, + { + "epoch": 5.89297124600639, + "grad_norm": 0.15453197062015533, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7378 + }, + { + "epoch": 5.893769968051118, + "grad_norm": 0.19460639357566833, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7379 + }, + { + "epoch": 5.894568690095847, + "grad_norm": 0.13046157360076904, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7380 + }, + { + "epoch": 5.895367412140575, + "grad_norm": 0.09074800461530685, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7381 + }, + { + "epoch": 5.896166134185304, + "grad_norm": 0.09315948188304901, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7382 + }, + { + "epoch": 5.896964856230032, + "grad_norm": 0.0572352297604084, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7383 + }, + { + "epoch": 5.897763578274761, + "grad_norm": 0.09366700798273087, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7384 + }, + { + "epoch": 5.8985623003194885, + "grad_norm": 0.12643125653266907, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7385 + }, + { + "epoch": 5.899361022364217, + "grad_norm": 0.14831441640853882, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7386 + }, + { + "epoch": 5.9001597444089455, + "grad_norm": 0.06892798840999603, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7387 + }, + { + "epoch": 5.900958466453674, + "grad_norm": 0.24058189988136292, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7388 + }, + { + "epoch": 5.901757188498403, + "grad_norm": 0.12589944899082184, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7389 + }, + { + "epoch": 5.902555910543131, + "grad_norm": 0.10197508335113525, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7390 + }, + { + "epoch": 5.90335463258786, + "grad_norm": 0.04367182031273842, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7391 + }, + { + "epoch": 5.904153354632588, + "grad_norm": 0.11131702363491058, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7392 + }, + { + "epoch": 5.904952076677317, + "grad_norm": 0.10258752107620239, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7393 + }, + { + "epoch": 5.905750798722044, + "grad_norm": 0.05077935755252838, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7394 + }, + { + "epoch": 5.906549520766773, + "grad_norm": 0.13514964282512665, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7395 + }, + { + "epoch": 5.907348242811501, + "grad_norm": 0.365681916475296, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7396 + }, + { + "epoch": 5.90814696485623, + "grad_norm": 0.09199032932519913, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7397 + }, + { + "epoch": 5.9089456869009584, + "grad_norm": 0.10341943800449371, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7398 + }, + { + "epoch": 5.909744408945687, + "grad_norm": 0.05396822467446327, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7399 + }, + { + "epoch": 5.9105431309904155, + "grad_norm": 0.06582850217819214, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 7400 + }, + { + "epoch": 5.911341853035144, + "grad_norm": 0.04932714253664017, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7401 + }, + { + "epoch": 5.912140575079873, + "grad_norm": 0.08820181339979172, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7402 + }, + { + "epoch": 5.9129392971246, + "grad_norm": 0.08759067952632904, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7403 + }, + { + "epoch": 5.913738019169329, + "grad_norm": 0.0582246370613575, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7404 + }, + { + "epoch": 5.914536741214057, + "grad_norm": 0.3632248044013977, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7405 + }, + { + "epoch": 5.915335463258786, + "grad_norm": 0.054485730826854706, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7406 + }, + { + "epoch": 5.916134185303514, + "grad_norm": 0.06776587665081024, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7407 + }, + { + "epoch": 5.916932907348243, + "grad_norm": 0.06876091659069061, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7408 + }, + { + "epoch": 5.917731629392971, + "grad_norm": 0.06507224589586258, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7409 + }, + { + "epoch": 5.9185303514377, + "grad_norm": 1.061123013496399, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7410 + }, + { + "epoch": 5.919329073482428, + "grad_norm": 0.2808170020580292, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7411 + }, + { + "epoch": 5.920127795527156, + "grad_norm": 0.2075907289981842, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7412 + }, + { + "epoch": 5.9209265175718855, + "grad_norm": 0.08707362413406372, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7413 + }, + { + "epoch": 5.921725239616613, + "grad_norm": 0.17357248067855835, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 7414 + }, + { + "epoch": 5.922523961661342, + "grad_norm": 0.19713328778743744, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7415 + }, + { + "epoch": 5.92332268370607, + "grad_norm": 0.10456258803606033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7416 + }, + { + "epoch": 5.924121405750799, + "grad_norm": 0.10678638517856598, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7417 + }, + { + "epoch": 5.924920127795527, + "grad_norm": 0.12577000260353088, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7418 + }, + { + "epoch": 5.925718849840256, + "grad_norm": 0.14730660617351532, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7419 + }, + { + "epoch": 5.926517571884984, + "grad_norm": 0.07055118680000305, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7420 + }, + { + "epoch": 5.927316293929713, + "grad_norm": 0.10249259322881699, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7421 + }, + { + "epoch": 5.928115015974441, + "grad_norm": 0.06859050691127777, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7422 + }, + { + "epoch": 5.928913738019169, + "grad_norm": 0.043517664074897766, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7423 + }, + { + "epoch": 5.9297124600638975, + "grad_norm": 0.06680947542190552, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7424 + }, + { + "epoch": 5.930511182108626, + "grad_norm": 0.07522429525852203, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7425 + }, + { + "epoch": 5.931309904153355, + "grad_norm": 0.15828543901443481, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7426 + }, + { + "epoch": 5.932108626198083, + "grad_norm": 0.19134600460529327, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7427 + }, + { + "epoch": 5.932907348242812, + "grad_norm": 0.12455222010612488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7428 + }, + { + "epoch": 5.93370607028754, + "grad_norm": 0.11147905886173248, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7429 + }, + { + "epoch": 5.934504792332269, + "grad_norm": 0.1238674744963646, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7430 + }, + { + "epoch": 5.935303514376997, + "grad_norm": 0.15700307488441467, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7431 + }, + { + "epoch": 5.936102236421725, + "grad_norm": 0.11487080156803131, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7432 + }, + { + "epoch": 5.936900958466453, + "grad_norm": 0.11961077898740768, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7433 + }, + { + "epoch": 5.937699680511182, + "grad_norm": 0.07594173401594162, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7434 + }, + { + "epoch": 5.93849840255591, + "grad_norm": 0.19439400732517242, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7435 + }, + { + "epoch": 5.939297124600639, + "grad_norm": 0.17745599150657654, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7436 + }, + { + "epoch": 5.9400958466453675, + "grad_norm": 0.15732692182064056, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7437 + }, + { + "epoch": 5.940894568690096, + "grad_norm": 0.08824916929006577, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7438 + }, + { + "epoch": 5.9416932907348246, + "grad_norm": 0.12354888767004013, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7439 + }, + { + "epoch": 5.942492012779553, + "grad_norm": 0.10940376669168472, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7440 + }, + { + "epoch": 5.943290734824281, + "grad_norm": 0.05808279290795326, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7441 + }, + { + "epoch": 5.944089456869009, + "grad_norm": 0.19519653916358948, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 7442 + }, + { + "epoch": 5.944888178913738, + "grad_norm": 0.07913058996200562, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7443 + }, + { + "epoch": 5.945686900958466, + "grad_norm": 0.5150377750396729, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7444 + }, + { + "epoch": 5.946485623003195, + "grad_norm": 0.24083790183067322, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7445 + }, + { + "epoch": 5.947284345047923, + "grad_norm": 0.11291394382715225, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7446 + }, + { + "epoch": 5.948083067092652, + "grad_norm": 0.0899023786187172, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7447 + }, + { + "epoch": 5.94888178913738, + "grad_norm": 0.05489958077669144, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7448 + }, + { + "epoch": 5.949680511182109, + "grad_norm": 0.12375161051750183, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7449 + }, + { + "epoch": 5.950479233226837, + "grad_norm": 0.11610512435436249, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7450 + }, + { + "epoch": 5.951277955271565, + "grad_norm": 0.06953240931034088, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7451 + }, + { + "epoch": 5.952076677316294, + "grad_norm": 0.09784717857837677, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7452 + }, + { + "epoch": 5.952875399361022, + "grad_norm": 0.059533409774303436, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7453 + }, + { + "epoch": 5.953674121405751, + "grad_norm": 0.06361017376184464, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7454 + }, + { + "epoch": 5.954472843450479, + "grad_norm": 0.33739587664604187, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7455 + }, + { + "epoch": 5.955271565495208, + "grad_norm": 0.0726039931178093, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7456 + }, + { + "epoch": 5.956070287539936, + "grad_norm": 0.047813788056373596, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7457 + }, + { + "epoch": 5.956869009584665, + "grad_norm": 0.05501490831375122, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7458 + }, + { + "epoch": 5.957667731629393, + "grad_norm": 0.24806374311447144, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7459 + }, + { + "epoch": 5.958466453674122, + "grad_norm": 0.09020408987998962, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7460 + }, + { + "epoch": 5.9592651757188495, + "grad_norm": 0.09845588356256485, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7461 + }, + { + "epoch": 5.960063897763578, + "grad_norm": 0.2733388841152191, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7462 + }, + { + "epoch": 5.960862619808307, + "grad_norm": 0.04368302598595619, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7463 + }, + { + "epoch": 5.961661341853035, + "grad_norm": 0.06559797376394272, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7464 + }, + { + "epoch": 5.962460063897764, + "grad_norm": 0.08194267004728317, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7465 + }, + { + "epoch": 5.963258785942492, + "grad_norm": 0.08440488576889038, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7466 + }, + { + "epoch": 5.964057507987221, + "grad_norm": 0.07046753168106079, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7467 + }, + { + "epoch": 5.964856230031949, + "grad_norm": 0.061910174787044525, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7468 + }, + { + "epoch": 5.965654952076678, + "grad_norm": 0.06781110167503357, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7469 + }, + { + "epoch": 5.966453674121405, + "grad_norm": 0.0626576617360115, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7470 + }, + { + "epoch": 5.967252396166134, + "grad_norm": 0.05339542031288147, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7471 + }, + { + "epoch": 5.968051118210862, + "grad_norm": 0.09167633950710297, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7472 + }, + { + "epoch": 5.968849840255591, + "grad_norm": 0.07272132486104965, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7473 + }, + { + "epoch": 5.9696485623003195, + "grad_norm": 0.1218709796667099, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7474 + }, + { + "epoch": 5.970447284345048, + "grad_norm": 0.21024082601070404, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7475 + }, + { + "epoch": 5.9712460063897765, + "grad_norm": 0.08869504183530807, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7476 + }, + { + "epoch": 5.972044728434505, + "grad_norm": 0.05930836871266365, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7477 + }, + { + "epoch": 5.972843450479234, + "grad_norm": 0.10009569674730301, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7478 + }, + { + "epoch": 5.973642172523961, + "grad_norm": 0.2543089687824249, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7479 + }, + { + "epoch": 5.97444089456869, + "grad_norm": 0.04702993854880333, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7480 + }, + { + "epoch": 5.975239616613418, + "grad_norm": 0.12841154634952545, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7481 + }, + { + "epoch": 5.976038338658147, + "grad_norm": 0.10137920081615448, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7482 + }, + { + "epoch": 5.976837060702875, + "grad_norm": 0.0582512766122818, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7483 + }, + { + "epoch": 5.977635782747604, + "grad_norm": 0.06556501984596252, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7484 + }, + { + "epoch": 5.978434504792332, + "grad_norm": 0.2065235674381256, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7485 + }, + { + "epoch": 5.979233226837061, + "grad_norm": 0.07943716645240784, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7486 + }, + { + "epoch": 5.9800319488817895, + "grad_norm": 0.05257594957947731, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7487 + }, + { + "epoch": 5.980830670926517, + "grad_norm": 0.06949680298566818, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7488 + }, + { + "epoch": 5.981629392971246, + "grad_norm": 0.0967894196510315, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7489 + }, + { + "epoch": 5.982428115015974, + "grad_norm": 1.068231463432312, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7490 + }, + { + "epoch": 5.983226837060703, + "grad_norm": 0.0648348405957222, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7491 + }, + { + "epoch": 5.984025559105431, + "grad_norm": 0.2540450096130371, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7492 + }, + { + "epoch": 5.98482428115016, + "grad_norm": 0.1624346375465393, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7493 + }, + { + "epoch": 5.985623003194888, + "grad_norm": 0.10054703056812286, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7494 + }, + { + "epoch": 5.986421725239617, + "grad_norm": 0.05147058889269829, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7495 + }, + { + "epoch": 5.987220447284345, + "grad_norm": 0.10036633163690567, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7496 + }, + { + "epoch": 5.988019169329074, + "grad_norm": 0.14611777663230896, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7497 + }, + { + "epoch": 5.988817891373802, + "grad_norm": 0.12323570251464844, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7498 + }, + { + "epoch": 5.98961661341853, + "grad_norm": 0.04539888724684715, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7499 + }, + { + "epoch": 5.9904153354632586, + "grad_norm": 0.14555387198925018, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7500 + }, + { + "epoch": 5.991214057507987, + "grad_norm": 0.3205990195274353, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7501 + }, + { + "epoch": 5.992012779552716, + "grad_norm": 0.22900770604610443, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7502 + }, + { + "epoch": 5.992811501597444, + "grad_norm": 0.11138728260993958, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7503 + }, + { + "epoch": 5.993610223642173, + "grad_norm": 0.09425637125968933, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7504 + }, + { + "epoch": 5.994408945686901, + "grad_norm": 0.18409870564937592, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7505 + }, + { + "epoch": 5.99520766773163, + "grad_norm": 0.1610010713338852, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7506 + }, + { + "epoch": 5.996006389776358, + "grad_norm": 0.2304852306842804, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7507 + }, + { + "epoch": 5.996805111821086, + "grad_norm": 0.09830645471811295, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7508 + }, + { + "epoch": 5.997603833865814, + "grad_norm": 0.12319398671388626, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7509 + }, + { + "epoch": 5.998402555910543, + "grad_norm": 0.07925699651241302, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7510 + }, + { + "epoch": 5.9992012779552715, + "grad_norm": 0.07079242914915085, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7511 + }, + { + "epoch": 6.0, + "grad_norm": 0.14047275483608246, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7512 + }, + { + "epoch": 6.0007987220447285, + "grad_norm": 0.172583669424057, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7513 + }, + { + "epoch": 6.001597444089457, + "grad_norm": 0.3635086119174957, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7514 + }, + { + "epoch": 6.002396166134186, + "grad_norm": 0.14463695883750916, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7515 + }, + { + "epoch": 6.003194888178914, + "grad_norm": 0.24417585134506226, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7516 + }, + { + "epoch": 6.003993610223642, + "grad_norm": 0.25690382719039917, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7517 + }, + { + "epoch": 6.00479233226837, + "grad_norm": 0.12535394728183746, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 7518 + }, + { + "epoch": 6.005591054313099, + "grad_norm": 0.19279715418815613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7519 + }, + { + "epoch": 6.006389776357827, + "grad_norm": 0.10537917166948318, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7520 + }, + { + "epoch": 6.007188498402556, + "grad_norm": 0.07752633094787598, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7521 + }, + { + "epoch": 6.007987220447284, + "grad_norm": 0.10693971067667007, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7522 + }, + { + "epoch": 6.008785942492013, + "grad_norm": 0.06399057805538177, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7523 + }, + { + "epoch": 6.0095846645367414, + "grad_norm": 0.12577609717845917, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7524 + }, + { + "epoch": 6.01038338658147, + "grad_norm": 0.12770701944828033, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7525 + }, + { + "epoch": 6.0111821086261985, + "grad_norm": 0.07679085433483124, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7526 + }, + { + "epoch": 6.011980830670926, + "grad_norm": 0.14353524148464203, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7527 + }, + { + "epoch": 6.012779552715655, + "grad_norm": 0.3428184688091278, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7528 + }, + { + "epoch": 6.013578274760383, + "grad_norm": 0.1436242014169693, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 7529 + }, + { + "epoch": 6.014376996805112, + "grad_norm": 0.07608507573604584, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7530 + }, + { + "epoch": 6.01517571884984, + "grad_norm": 0.10932086408138275, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7531 + }, + { + "epoch": 6.015974440894569, + "grad_norm": 0.07631878554821014, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7532 + }, + { + "epoch": 6.016773162939297, + "grad_norm": 0.0718175396323204, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7533 + }, + { + "epoch": 6.017571884984026, + "grad_norm": 0.07661164551973343, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7534 + }, + { + "epoch": 6.018370607028754, + "grad_norm": 0.10753245651721954, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7535 + }, + { + "epoch": 6.019169329073482, + "grad_norm": 0.12740729749202728, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7536 + }, + { + "epoch": 6.0199680511182105, + "grad_norm": 0.14345388114452362, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7537 + }, + { + "epoch": 6.020766773162939, + "grad_norm": 0.13860031962394714, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7538 + }, + { + "epoch": 6.021565495207668, + "grad_norm": 0.07766555994749069, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7539 + }, + { + "epoch": 6.022364217252396, + "grad_norm": 0.11253347247838974, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7540 + }, + { + "epoch": 6.023162939297125, + "grad_norm": 0.18870452046394348, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7541 + }, + { + "epoch": 6.023961661341853, + "grad_norm": 0.12401654571294785, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7542 + }, + { + "epoch": 6.024760383386582, + "grad_norm": 0.08025321364402771, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7543 + }, + { + "epoch": 6.02555910543131, + "grad_norm": 0.12504157423973083, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7544 + }, + { + "epoch": 6.026357827476039, + "grad_norm": 0.07099851220846176, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7545 + }, + { + "epoch": 6.027156549520766, + "grad_norm": 0.09573683142662048, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7546 + }, + { + "epoch": 6.027955271565495, + "grad_norm": 0.18280553817749023, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7547 + }, + { + "epoch": 6.0287539936102235, + "grad_norm": 0.15688058733940125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7548 + }, + { + "epoch": 6.029552715654952, + "grad_norm": 0.11738436669111252, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7549 + }, + { + "epoch": 6.0303514376996805, + "grad_norm": 1.275103211402893, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7550 + }, + { + "epoch": 6.031150159744409, + "grad_norm": 0.39542102813720703, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7551 + }, + { + "epoch": 6.031948881789138, + "grad_norm": 0.32140371203422546, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7552 + }, + { + "epoch": 6.032747603833866, + "grad_norm": 0.2855371832847595, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7553 + }, + { + "epoch": 6.033546325878595, + "grad_norm": 0.14987513422966003, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7554 + }, + { + "epoch": 6.034345047923322, + "grad_norm": 0.25978198647499084, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7555 + }, + { + "epoch": 6.035143769968051, + "grad_norm": 0.14043942093849182, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7556 + }, + { + "epoch": 6.035942492012779, + "grad_norm": 0.16670344769954681, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7557 + }, + { + "epoch": 6.036741214057508, + "grad_norm": 0.1668681800365448, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7558 + }, + { + "epoch": 6.037539936102236, + "grad_norm": 0.11135906726121902, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7559 + }, + { + "epoch": 6.038338658146965, + "grad_norm": 0.26222026348114014, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7560 + }, + { + "epoch": 6.039137380191693, + "grad_norm": 0.1670113205909729, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7561 + }, + { + "epoch": 6.039936102236422, + "grad_norm": 0.15860766172409058, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7562 + }, + { + "epoch": 6.0407348242811505, + "grad_norm": 0.2577793300151825, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7563 + }, + { + "epoch": 6.041533546325879, + "grad_norm": 0.11147591471672058, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7564 + }, + { + "epoch": 6.042332268370607, + "grad_norm": 0.18452385067939758, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7565 + }, + { + "epoch": 6.043130990415335, + "grad_norm": 0.19697625935077667, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7566 + }, + { + "epoch": 6.043929712460064, + "grad_norm": 0.08586452901363373, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7567 + }, + { + "epoch": 6.044728434504792, + "grad_norm": 0.18721693754196167, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7568 + }, + { + "epoch": 6.045527156549521, + "grad_norm": 0.13190758228302002, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7569 + }, + { + "epoch": 6.046325878594249, + "grad_norm": 0.09424075484275818, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7570 + }, + { + "epoch": 6.047124600638978, + "grad_norm": 0.15252210199832916, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7571 + }, + { + "epoch": 6.047923322683706, + "grad_norm": 0.06378420442342758, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7572 + }, + { + "epoch": 6.048722044728435, + "grad_norm": 0.07665325701236725, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7573 + }, + { + "epoch": 6.0495207667731625, + "grad_norm": 0.0847245529294014, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7574 + }, + { + "epoch": 6.050319488817891, + "grad_norm": 0.034070566296577454, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7575 + }, + { + "epoch": 6.05111821086262, + "grad_norm": 0.08149915188550949, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7576 + }, + { + "epoch": 6.051916932907348, + "grad_norm": 0.07882412523031235, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7577 + }, + { + "epoch": 6.052715654952077, + "grad_norm": 0.055492956191301346, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7578 + }, + { + "epoch": 6.053514376996805, + "grad_norm": 0.10246025770902634, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7579 + }, + { + "epoch": 6.054313099041534, + "grad_norm": 0.11067861318588257, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7580 + }, + { + "epoch": 6.055111821086262, + "grad_norm": 0.06063758581876755, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7581 + }, + { + "epoch": 6.055910543130991, + "grad_norm": 0.06848330795764923, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 7582 + }, + { + "epoch": 6.056709265175719, + "grad_norm": 0.10336993634700775, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7583 + }, + { + "epoch": 6.057507987220447, + "grad_norm": 0.06081530824303627, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7584 + }, + { + "epoch": 6.0583067092651754, + "grad_norm": 0.08049804717302322, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7585 + }, + { + "epoch": 6.059105431309904, + "grad_norm": 0.09174875915050507, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7586 + }, + { + "epoch": 6.0599041533546325, + "grad_norm": 0.06121581420302391, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7587 + }, + { + "epoch": 6.060702875399361, + "grad_norm": 0.10653077065944672, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7588 + }, + { + "epoch": 6.06150159744409, + "grad_norm": 0.0676097571849823, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7589 + }, + { + "epoch": 6.062300319488818, + "grad_norm": 0.0625678300857544, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7590 + }, + { + "epoch": 6.063099041533547, + "grad_norm": 0.07936695963144302, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7591 + }, + { + "epoch": 6.063897763578275, + "grad_norm": 0.06149541214108467, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7592 + }, + { + "epoch": 6.064696485623003, + "grad_norm": 0.04549092426896095, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7593 + }, + { + "epoch": 6.065495207667731, + "grad_norm": 0.06483953446149826, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7594 + }, + { + "epoch": 6.06629392971246, + "grad_norm": 0.04048188030719757, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7595 + }, + { + "epoch": 6.067092651757188, + "grad_norm": 0.038281429558992386, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7596 + }, + { + "epoch": 6.067891373801917, + "grad_norm": 0.06686673313379288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7597 + }, + { + "epoch": 6.068690095846645, + "grad_norm": 0.09025852382183075, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7598 + }, + { + "epoch": 6.069488817891374, + "grad_norm": 0.07517793774604797, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7599 + }, + { + "epoch": 6.0702875399361025, + "grad_norm": 0.06342573463916779, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7600 + }, + { + "epoch": 6.071086261980831, + "grad_norm": 0.08630760759115219, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7601 + }, + { + "epoch": 6.0718849840255595, + "grad_norm": 0.06443625688552856, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7602 + }, + { + "epoch": 6.072683706070287, + "grad_norm": 0.08748311549425125, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7603 + }, + { + "epoch": 6.073482428115016, + "grad_norm": 0.051623452454805374, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7604 + }, + { + "epoch": 6.074281150159744, + "grad_norm": 0.09098891913890839, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7605 + }, + { + "epoch": 6.075079872204473, + "grad_norm": 0.14741428196430206, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7606 + }, + { + "epoch": 6.075878594249201, + "grad_norm": 0.064545176923275, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7607 + }, + { + "epoch": 6.07667731629393, + "grad_norm": 0.09775100648403168, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7608 + }, + { + "epoch": 6.077476038338658, + "grad_norm": 0.14192643761634827, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7609 + }, + { + "epoch": 6.078274760383387, + "grad_norm": 0.05390379950404167, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7610 + }, + { + "epoch": 6.079073482428115, + "grad_norm": 0.35628536343574524, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7611 + }, + { + "epoch": 6.079872204472843, + "grad_norm": 0.11727920919656754, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7612 + }, + { + "epoch": 6.080670926517572, + "grad_norm": 0.053165338933467865, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7613 + }, + { + "epoch": 6.0814696485623, + "grad_norm": 0.12718519568443298, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7614 + }, + { + "epoch": 6.082268370607029, + "grad_norm": 0.12406741827726364, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7615 + }, + { + "epoch": 6.083067092651757, + "grad_norm": 0.05323740839958191, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7616 + }, + { + "epoch": 6.083865814696486, + "grad_norm": 0.09811960160732269, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7617 + }, + { + "epoch": 6.084664536741214, + "grad_norm": 0.12453506886959076, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7618 + }, + { + "epoch": 6.085463258785943, + "grad_norm": 0.13459496200084686, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7619 + }, + { + "epoch": 6.086261980830671, + "grad_norm": 0.20130378007888794, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7620 + }, + { + "epoch": 6.0870607028754, + "grad_norm": 0.11361974477767944, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7621 + }, + { + "epoch": 6.087859424920127, + "grad_norm": 0.07432135194540024, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7622 + }, + { + "epoch": 6.088658146964856, + "grad_norm": 0.14522314071655273, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7623 + }, + { + "epoch": 6.0894568690095845, + "grad_norm": 0.050937261432409286, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7624 + }, + { + "epoch": 6.090255591054313, + "grad_norm": 0.12386021763086319, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7625 + }, + { + "epoch": 6.0910543130990416, + "grad_norm": 0.1498231738805771, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7626 + }, + { + "epoch": 6.09185303514377, + "grad_norm": 0.042041294276714325, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7627 + }, + { + "epoch": 6.092651757188499, + "grad_norm": 0.1103961393237114, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7628 + }, + { + "epoch": 6.093450479233227, + "grad_norm": 0.12362606078386307, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7629 + }, + { + "epoch": 6.094249201277956, + "grad_norm": 0.07069346308708191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7630 + }, + { + "epoch": 6.095047923322683, + "grad_norm": 0.1306593418121338, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7631 + }, + { + "epoch": 6.095846645367412, + "grad_norm": 0.11293961852788925, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7632 + }, + { + "epoch": 6.09664536741214, + "grad_norm": 0.07145176827907562, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7633 + }, + { + "epoch": 6.097444089456869, + "grad_norm": 0.11122562736272812, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7634 + }, + { + "epoch": 6.098242811501597, + "grad_norm": 0.039713576436042786, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7635 + }, + { + "epoch": 6.099041533546326, + "grad_norm": 0.11573004722595215, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7636 + }, + { + "epoch": 6.0998402555910545, + "grad_norm": 0.11995833367109299, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7637 + }, + { + "epoch": 6.100638977635783, + "grad_norm": 0.03895663470029831, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7638 + }, + { + "epoch": 6.1014376996805115, + "grad_norm": 0.11274216324090958, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7639 + }, + { + "epoch": 6.102236421725239, + "grad_norm": 0.14242613315582275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7640 + }, + { + "epoch": 6.103035143769968, + "grad_norm": 0.04954848438501358, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7641 + }, + { + "epoch": 6.103833865814696, + "grad_norm": 0.10814809799194336, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7642 + }, + { + "epoch": 6.104632587859425, + "grad_norm": 0.11696363240480423, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7643 + }, + { + "epoch": 6.105431309904153, + "grad_norm": 0.04597959294915199, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7644 + }, + { + "epoch": 6.106230031948882, + "grad_norm": 0.16304457187652588, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7645 + }, + { + "epoch": 6.10702875399361, + "grad_norm": 0.14835208654403687, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7646 + }, + { + "epoch": 6.107827476038339, + "grad_norm": 0.06062949076294899, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7647 + }, + { + "epoch": 6.108626198083067, + "grad_norm": 0.1033453568816185, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7648 + }, + { + "epoch": 6.109424920127796, + "grad_norm": 0.14823280274868011, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7649 + }, + { + "epoch": 6.110223642172524, + "grad_norm": 0.18282924592494965, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7650 + }, + { + "epoch": 6.111022364217252, + "grad_norm": 0.17962203919887543, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7651 + }, + { + "epoch": 6.111821086261981, + "grad_norm": 0.12176015228033066, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7652 + }, + { + "epoch": 6.112619808306709, + "grad_norm": 0.07326921075582504, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7653 + }, + { + "epoch": 6.113418530351438, + "grad_norm": 0.24457645416259766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7654 + }, + { + "epoch": 6.114217252396166, + "grad_norm": 0.1442916989326477, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7655 + }, + { + "epoch": 6.115015974440895, + "grad_norm": 0.0716436356306076, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7656 + }, + { + "epoch": 6.115814696485623, + "grad_norm": 0.20782648026943207, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7657 + }, + { + "epoch": 6.116613418530352, + "grad_norm": 0.1183728352189064, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7658 + }, + { + "epoch": 6.11741214057508, + "grad_norm": 0.13251493871212006, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7659 + }, + { + "epoch": 6.118210862619808, + "grad_norm": 0.21223802864551544, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7660 + }, + { + "epoch": 6.1190095846645365, + "grad_norm": 0.0811460018157959, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7661 + }, + { + "epoch": 6.119808306709265, + "grad_norm": 0.13528718054294586, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7662 + }, + { + "epoch": 6.1206070287539935, + "grad_norm": 0.11806038022041321, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7663 + }, + { + "epoch": 6.121405750798722, + "grad_norm": 0.10022544860839844, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7664 + }, + { + "epoch": 6.122204472843451, + "grad_norm": 0.21452540159225464, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7665 + }, + { + "epoch": 6.123003194888179, + "grad_norm": 0.11949847638607025, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7666 + }, + { + "epoch": 6.123801916932908, + "grad_norm": 0.12636634707450867, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7667 + }, + { + "epoch": 6.124600638977636, + "grad_norm": 0.17132572829723358, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7668 + }, + { + "epoch": 6.125399361022364, + "grad_norm": 0.1116800457239151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7669 + }, + { + "epoch": 6.126198083067092, + "grad_norm": 0.13965120911598206, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7670 + }, + { + "epoch": 6.126996805111821, + "grad_norm": 0.1346610188484192, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7671 + }, + { + "epoch": 6.127795527156549, + "grad_norm": 0.07977228611707687, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7672 + }, + { + "epoch": 6.128594249201278, + "grad_norm": 0.21412506699562073, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7673 + }, + { + "epoch": 6.1293929712460065, + "grad_norm": 0.172305628657341, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7674 + }, + { + "epoch": 6.130191693290735, + "grad_norm": 0.10782980173826218, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7675 + }, + { + "epoch": 6.1309904153354635, + "grad_norm": 0.23166432976722717, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7676 + }, + { + "epoch": 6.131789137380192, + "grad_norm": 0.12337028980255127, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7677 + }, + { + "epoch": 6.13258785942492, + "grad_norm": 0.11406251043081284, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7678 + }, + { + "epoch": 6.133386581469648, + "grad_norm": 0.19163282215595245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7679 + }, + { + "epoch": 6.134185303514377, + "grad_norm": 0.06671248376369476, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7680 + }, + { + "epoch": 6.134984025559105, + "grad_norm": 0.13190557062625885, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7681 + }, + { + "epoch": 6.135782747603834, + "grad_norm": 0.20761321485042572, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7682 + }, + { + "epoch": 6.136581469648562, + "grad_norm": 0.08118047565221786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7683 + }, + { + "epoch": 6.137380191693291, + "grad_norm": 0.1458984613418579, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7684 + }, + { + "epoch": 6.138178913738019, + "grad_norm": 0.1305929571390152, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7685 + }, + { + "epoch": 6.138977635782748, + "grad_norm": 0.0972108244895935, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7686 + }, + { + "epoch": 6.139776357827476, + "grad_norm": 0.14246216416358948, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7687 + }, + { + "epoch": 6.140575079872204, + "grad_norm": 0.04341820999979973, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7688 + }, + { + "epoch": 6.141373801916933, + "grad_norm": 0.127020001411438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7689 + }, + { + "epoch": 6.142172523961661, + "grad_norm": 0.08494339138269424, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7690 + }, + { + "epoch": 6.14297124600639, + "grad_norm": 0.11377454549074173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7691 + }, + { + "epoch": 6.143769968051118, + "grad_norm": 0.13752779364585876, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7692 + }, + { + "epoch": 6.144568690095847, + "grad_norm": 0.054878801107406616, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7693 + }, + { + "epoch": 6.145367412140575, + "grad_norm": 0.11313790827989578, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7694 + }, + { + "epoch": 6.146166134185304, + "grad_norm": 0.04388728365302086, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7695 + }, + { + "epoch": 6.146964856230032, + "grad_norm": 0.12842994928359985, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7696 + }, + { + "epoch": 6.147763578274761, + "grad_norm": 0.1374971568584442, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7697 + }, + { + "epoch": 6.1485623003194885, + "grad_norm": 0.1082429438829422, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7698 + }, + { + "epoch": 6.149361022364217, + "grad_norm": 0.14329178631305695, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7699 + }, + { + "epoch": 6.1501597444089455, + "grad_norm": 0.07794678211212158, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7700 + }, + { + "epoch": 6.150958466453674, + "grad_norm": 0.10680928826332092, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7701 + }, + { + "epoch": 6.151757188498403, + "grad_norm": 0.11628691852092743, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7702 + }, + { + "epoch": 6.152555910543131, + "grad_norm": 0.03565143793821335, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7703 + }, + { + "epoch": 6.15335463258786, + "grad_norm": 0.10634133219718933, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7704 + }, + { + "epoch": 6.154153354632588, + "grad_norm": 0.10307054221630096, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7705 + }, + { + "epoch": 6.154952076677317, + "grad_norm": 0.05591967701911926, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7706 + }, + { + "epoch": 6.155750798722044, + "grad_norm": 0.07205721735954285, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7707 + }, + { + "epoch": 6.156549520766773, + "grad_norm": 0.05020968243479729, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7708 + }, + { + "epoch": 6.157348242811501, + "grad_norm": 0.037087470293045044, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7709 + }, + { + "epoch": 6.15814696485623, + "grad_norm": 0.06322529166936874, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7710 + }, + { + "epoch": 6.1589456869009584, + "grad_norm": 0.03881093114614487, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7711 + }, + { + "epoch": 6.159744408945687, + "grad_norm": 0.06219052895903587, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7712 + }, + { + "epoch": 6.1605431309904155, + "grad_norm": 0.043313659727573395, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7713 + }, + { + "epoch": 6.161341853035144, + "grad_norm": 0.05460439994931221, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7714 + }, + { + "epoch": 6.162140575079873, + "grad_norm": 0.045017000287771225, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7715 + }, + { + "epoch": 6.1629392971246, + "grad_norm": 0.08029863983392715, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7716 + }, + { + "epoch": 6.163738019169329, + "grad_norm": 0.06935936212539673, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7717 + }, + { + "epoch": 6.164536741214057, + "grad_norm": 0.12617695331573486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7718 + }, + { + "epoch": 6.165335463258786, + "grad_norm": 0.09746283292770386, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7719 + }, + { + "epoch": 6.166134185303514, + "grad_norm": 0.038731649518013, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7720 + }, + { + "epoch": 6.166932907348243, + "grad_norm": 0.1054256334900856, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7721 + }, + { + "epoch": 6.167731629392971, + "grad_norm": 0.0833977535367012, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7722 + }, + { + "epoch": 6.1685303514377, + "grad_norm": 1.3529000282287598, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7723 + }, + { + "epoch": 6.169329073482428, + "grad_norm": 0.06748781353235245, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7724 + }, + { + "epoch": 6.170127795527157, + "grad_norm": 0.06015792861580849, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7725 + }, + { + "epoch": 6.170926517571885, + "grad_norm": 0.07760192453861237, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7726 + }, + { + "epoch": 6.171725239616613, + "grad_norm": 0.09536328911781311, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7727 + }, + { + "epoch": 6.172523961661342, + "grad_norm": 0.051248203963041306, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7728 + }, + { + "epoch": 6.17332268370607, + "grad_norm": 0.09610000252723694, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7729 + }, + { + "epoch": 6.174121405750799, + "grad_norm": 0.0803515687584877, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7730 + }, + { + "epoch": 6.174920127795527, + "grad_norm": 0.0820179283618927, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7731 + }, + { + "epoch": 6.175718849840256, + "grad_norm": 0.08880780637264252, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7732 + }, + { + "epoch": 6.176517571884984, + "grad_norm": 0.12188591808080673, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7733 + }, + { + "epoch": 6.177316293929713, + "grad_norm": 0.06245967745780945, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7734 + }, + { + "epoch": 6.178115015974441, + "grad_norm": 0.06608586013317108, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7735 + }, + { + "epoch": 6.178913738019169, + "grad_norm": 0.08542132377624512, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7736 + }, + { + "epoch": 6.1797124600638975, + "grad_norm": 0.06510723382234573, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7737 + }, + { + "epoch": 6.180511182108626, + "grad_norm": 0.161012202501297, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7738 + }, + { + "epoch": 6.181309904153355, + "grad_norm": 0.07943159341812134, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7739 + }, + { + "epoch": 6.182108626198083, + "grad_norm": 0.07735269516706467, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7740 + }, + { + "epoch": 6.182907348242812, + "grad_norm": 0.07452470809221268, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7741 + }, + { + "epoch": 6.18370607028754, + "grad_norm": 0.06378357857465744, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7742 + }, + { + "epoch": 6.184504792332269, + "grad_norm": 0.06149968132376671, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7743 + }, + { + "epoch": 6.185303514376997, + "grad_norm": 0.06558738648891449, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7744 + }, + { + "epoch": 6.186102236421725, + "grad_norm": 0.06004631146788597, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7745 + }, + { + "epoch": 6.186900958466453, + "grad_norm": 0.09972328692674637, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7746 + }, + { + "epoch": 6.187699680511182, + "grad_norm": 0.059344276785850525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7747 + }, + { + "epoch": 6.18849840255591, + "grad_norm": 0.15083496272563934, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7748 + }, + { + "epoch": 6.189297124600639, + "grad_norm": 0.08041606843471527, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7749 + }, + { + "epoch": 6.1900958466453675, + "grad_norm": 0.0801318883895874, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7750 + }, + { + "epoch": 6.190894568690096, + "grad_norm": 0.13313926756381989, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7751 + }, + { + "epoch": 6.1916932907348246, + "grad_norm": 0.07887420803308487, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7752 + }, + { + "epoch": 6.192492012779553, + "grad_norm": 0.08653397113084793, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7753 + }, + { + "epoch": 6.193290734824281, + "grad_norm": 0.12184617668390274, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7754 + }, + { + "epoch": 6.194089456869009, + "grad_norm": 0.05356535315513611, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7755 + }, + { + "epoch": 6.194888178913738, + "grad_norm": 0.09529519081115723, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7756 + }, + { + "epoch": 6.195686900958466, + "grad_norm": 0.07658126950263977, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7757 + }, + { + "epoch": 6.196485623003195, + "grad_norm": 0.0785149484872818, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7758 + }, + { + "epoch": 6.197284345047923, + "grad_norm": 0.10748651623725891, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7759 + }, + { + "epoch": 6.198083067092652, + "grad_norm": 0.056907687336206436, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7760 + }, + { + "epoch": 6.19888178913738, + "grad_norm": 0.3713622987270355, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7761 + }, + { + "epoch": 6.199680511182109, + "grad_norm": 0.16671019792556763, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7762 + }, + { + "epoch": 6.2004792332268375, + "grad_norm": 0.10214395076036453, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7763 + }, + { + "epoch": 6.201277955271565, + "grad_norm": 0.09181013703346252, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7764 + }, + { + "epoch": 6.202076677316294, + "grad_norm": 0.18003405630588531, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7765 + }, + { + "epoch": 6.202875399361022, + "grad_norm": 0.1032429188489914, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7766 + }, + { + "epoch": 6.203674121405751, + "grad_norm": 0.06787005811929703, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7767 + }, + { + "epoch": 6.204472843450479, + "grad_norm": 0.09422674775123596, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7768 + }, + { + "epoch": 6.205271565495208, + "grad_norm": 0.04083932563662529, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7769 + }, + { + "epoch": 6.206070287539936, + "grad_norm": 0.1368017941713333, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7770 + }, + { + "epoch": 6.206869009584665, + "grad_norm": 0.23276877403259277, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7771 + }, + { + "epoch": 6.207667731629393, + "grad_norm": 0.13092860579490662, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7772 + }, + { + "epoch": 6.208466453674121, + "grad_norm": 0.14030441641807556, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7773 + }, + { + "epoch": 6.2092651757188495, + "grad_norm": 0.2016047090291977, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7774 + }, + { + "epoch": 6.210063897763578, + "grad_norm": 0.1224871277809143, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7775 + }, + { + "epoch": 6.210862619808307, + "grad_norm": 0.10741977393627167, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7776 + }, + { + "epoch": 6.211661341853035, + "grad_norm": 0.19775021076202393, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7777 + }, + { + "epoch": 6.212460063897764, + "grad_norm": 0.06731278449296951, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7778 + }, + { + "epoch": 6.213258785942492, + "grad_norm": 0.14070862531661987, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7779 + }, + { + "epoch": 6.214057507987221, + "grad_norm": 0.1267949938774109, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7780 + }, + { + "epoch": 6.214856230031949, + "grad_norm": 0.0694371834397316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7781 + }, + { + "epoch": 6.215654952076678, + "grad_norm": 0.12222267687320709, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7782 + }, + { + "epoch": 6.216453674121405, + "grad_norm": 0.1105445921421051, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7783 + }, + { + "epoch": 6.217252396166134, + "grad_norm": 0.05993608012795448, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7784 + }, + { + "epoch": 6.218051118210862, + "grad_norm": 0.11157821118831635, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7785 + }, + { + "epoch": 6.218849840255591, + "grad_norm": 0.05242336913943291, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7786 + }, + { + "epoch": 6.2196485623003195, + "grad_norm": 0.046115025877952576, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7787 + }, + { + "epoch": 6.220447284345048, + "grad_norm": 0.04029909893870354, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7788 + }, + { + "epoch": 6.2212460063897765, + "grad_norm": 0.057172924280166626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7789 + }, + { + "epoch": 6.222044728434505, + "grad_norm": 0.04958837479352951, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7790 + }, + { + "epoch": 6.222843450479234, + "grad_norm": 0.046313852071762085, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7791 + }, + { + "epoch": 6.223642172523961, + "grad_norm": 0.03824630752205849, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7792 + }, + { + "epoch": 6.22444089456869, + "grad_norm": 0.07159019261598587, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7793 + }, + { + "epoch": 6.225239616613418, + "grad_norm": 0.06316389888525009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7794 + }, + { + "epoch": 6.226038338658147, + "grad_norm": 0.088447704911232, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7795 + }, + { + "epoch": 6.226837060702875, + "grad_norm": 0.08749943226575851, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7796 + }, + { + "epoch": 6.227635782747604, + "grad_norm": 0.08757520467042923, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7797 + }, + { + "epoch": 6.228434504792332, + "grad_norm": 0.10777202993631363, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7798 + }, + { + "epoch": 6.229233226837061, + "grad_norm": 0.15780584514141083, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7799 + }, + { + "epoch": 6.2300319488817895, + "grad_norm": 0.10375814139842987, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7800 + }, + { + "epoch": 6.230830670926518, + "grad_norm": 0.3544321656227112, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7801 + }, + { + "epoch": 6.231629392971246, + "grad_norm": 0.11117644608020782, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7802 + }, + { + "epoch": 6.232428115015974, + "grad_norm": 0.13096286356449127, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7803 + }, + { + "epoch": 6.233226837060703, + "grad_norm": 0.2706630229949951, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7804 + }, + { + "epoch": 6.234025559105431, + "grad_norm": 0.05805981904268265, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7805 + }, + { + "epoch": 6.23482428115016, + "grad_norm": 0.14731241762638092, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7806 + }, + { + "epoch": 6.235623003194888, + "grad_norm": 0.08912478387355804, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7807 + }, + { + "epoch": 6.236421725239617, + "grad_norm": 0.15754206478595734, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7808 + }, + { + "epoch": 6.237220447284345, + "grad_norm": 0.21143318712711334, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7809 + }, + { + "epoch": 6.238019169329074, + "grad_norm": 0.11839418858289719, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7810 + }, + { + "epoch": 6.2388178913738015, + "grad_norm": 0.23939856886863708, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7811 + }, + { + "epoch": 6.23961661341853, + "grad_norm": 0.1438305526971817, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7812 + }, + { + "epoch": 6.2404153354632586, + "grad_norm": 0.11111237108707428, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7813 + }, + { + "epoch": 6.241214057507987, + "grad_norm": 0.19577394425868988, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7814 + }, + { + "epoch": 6.242012779552716, + "grad_norm": 0.1399260312318802, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7815 + }, + { + "epoch": 6.242811501597444, + "grad_norm": 0.16393627226352692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7816 + }, + { + "epoch": 6.243610223642173, + "grad_norm": 0.15071940422058105, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7817 + }, + { + "epoch": 6.244408945686901, + "grad_norm": 0.2121957242488861, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7818 + }, + { + "epoch": 6.24520766773163, + "grad_norm": 0.09854442626237869, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7819 + }, + { + "epoch": 6.246006389776358, + "grad_norm": 0.1327667534351349, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7820 + }, + { + "epoch": 6.246805111821086, + "grad_norm": 0.13909243047237396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7821 + }, + { + "epoch": 6.247603833865814, + "grad_norm": 0.08482292294502258, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7822 + }, + { + "epoch": 6.248402555910543, + "grad_norm": 0.0918656438589096, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7823 + }, + { + "epoch": 6.2492012779552715, + "grad_norm": 0.1352611631155014, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7824 + }, + { + "epoch": 6.25, + "grad_norm": 0.06178867816925049, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7825 + }, + { + "epoch": 6.2507987220447285, + "grad_norm": 0.1285342425107956, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7826 + }, + { + "epoch": 6.251597444089457, + "grad_norm": 0.17862951755523682, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7827 + }, + { + "epoch": 6.252396166134186, + "grad_norm": 0.574928343296051, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7828 + }, + { + "epoch": 6.253194888178914, + "grad_norm": 0.11522867530584335, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 7829 + }, + { + "epoch": 6.253993610223642, + "grad_norm": 0.08348001539707184, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7830 + }, + { + "epoch": 6.25479233226837, + "grad_norm": 0.1015007346868515, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7831 + }, + { + "epoch": 6.255591054313099, + "grad_norm": 0.18213561177253723, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7832 + }, + { + "epoch": 6.256389776357827, + "grad_norm": 0.1056833565235138, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7833 + }, + { + "epoch": 6.257188498402556, + "grad_norm": 0.09715890139341354, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7834 + }, + { + "epoch": 6.257987220447284, + "grad_norm": 0.17651355266571045, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7835 + }, + { + "epoch": 6.258785942492013, + "grad_norm": 0.11858265846967697, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7836 + }, + { + "epoch": 6.2595846645367414, + "grad_norm": 0.1400168240070343, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7837 + }, + { + "epoch": 6.26038338658147, + "grad_norm": 0.2133244276046753, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7838 + }, + { + "epoch": 6.261182108626198, + "grad_norm": 0.087309330701828, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7839 + }, + { + "epoch": 6.261980830670926, + "grad_norm": 0.07735110074281693, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7840 + }, + { + "epoch": 6.262779552715655, + "grad_norm": 0.08314932882785797, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7841 + }, + { + "epoch": 6.263578274760383, + "grad_norm": 0.13448217511177063, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7842 + }, + { + "epoch": 6.264376996805112, + "grad_norm": 1.4022712707519531, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7843 + }, + { + "epoch": 6.26517571884984, + "grad_norm": 0.1107354387640953, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7844 + }, + { + "epoch": 6.265974440894569, + "grad_norm": 0.17282478511333466, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7845 + }, + { + "epoch": 6.266773162939297, + "grad_norm": 0.0903516560792923, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7846 + }, + { + "epoch": 6.267571884984026, + "grad_norm": 0.07628770172595978, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7847 + }, + { + "epoch": 6.268370607028754, + "grad_norm": 0.08877440541982651, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7848 + }, + { + "epoch": 6.269169329073483, + "grad_norm": 0.041159700602293015, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7849 + }, + { + "epoch": 6.2699680511182105, + "grad_norm": 0.09187504649162292, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7850 + }, + { + "epoch": 6.270766773162939, + "grad_norm": 0.11252478510141373, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7851 + }, + { + "epoch": 6.271565495207668, + "grad_norm": 0.04354100301861763, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7852 + }, + { + "epoch": 6.272364217252396, + "grad_norm": 0.06845738738775253, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7853 + }, + { + "epoch": 6.273162939297125, + "grad_norm": 0.047235157340765, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7854 + }, + { + "epoch": 6.273961661341853, + "grad_norm": 0.04571741819381714, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7855 + }, + { + "epoch": 6.274760383386582, + "grad_norm": 0.09801016747951508, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7856 + }, + { + "epoch": 6.27555910543131, + "grad_norm": 0.12422922253608704, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7857 + }, + { + "epoch": 6.276357827476039, + "grad_norm": 0.07283129543066025, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7858 + }, + { + "epoch": 6.277156549520766, + "grad_norm": 0.07217510044574738, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7859 + }, + { + "epoch": 6.277955271565495, + "grad_norm": 0.1102033257484436, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7860 + }, + { + "epoch": 6.2787539936102235, + "grad_norm": 0.0814276710152626, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7861 + }, + { + "epoch": 6.279552715654952, + "grad_norm": 0.08247577399015427, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7862 + }, + { + "epoch": 6.2803514376996805, + "grad_norm": 0.04042622447013855, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7863 + }, + { + "epoch": 6.281150159744409, + "grad_norm": 0.049153268337249756, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7864 + }, + { + "epoch": 6.281948881789138, + "grad_norm": 0.07062675058841705, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7865 + }, + { + "epoch": 6.282747603833866, + "grad_norm": 0.06458686292171478, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7866 + }, + { + "epoch": 6.283546325878595, + "grad_norm": 0.093512162566185, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7867 + }, + { + "epoch": 6.284345047923322, + "grad_norm": 0.054384954273700714, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7868 + }, + { + "epoch": 6.285143769968051, + "grad_norm": 0.06253736466169357, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7869 + }, + { + "epoch": 6.285942492012779, + "grad_norm": 0.05566808953881264, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7870 + }, + { + "epoch": 6.286741214057508, + "grad_norm": 0.07693472504615784, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7871 + }, + { + "epoch": 6.287539936102236, + "grad_norm": 0.04471312463283539, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7872 + }, + { + "epoch": 6.288338658146965, + "grad_norm": 0.050770796835422516, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7873 + }, + { + "epoch": 6.289137380191693, + "grad_norm": 0.04736769199371338, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7874 + }, + { + "epoch": 6.289936102236422, + "grad_norm": 0.06550426036119461, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7875 + }, + { + "epoch": 6.2907348242811505, + "grad_norm": 0.0524384006857872, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7876 + }, + { + "epoch": 6.291533546325878, + "grad_norm": 0.10091802477836609, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7877 + }, + { + "epoch": 6.292332268370607, + "grad_norm": 0.14296530187129974, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7878 + }, + { + "epoch": 6.293130990415335, + "grad_norm": 0.08703069388866425, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7879 + }, + { + "epoch": 6.293929712460064, + "grad_norm": 0.05628393217921257, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7880 + }, + { + "epoch": 6.294728434504792, + "grad_norm": 0.09164825826883316, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7881 + }, + { + "epoch": 6.295527156549521, + "grad_norm": 0.09182474762201309, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7882 + }, + { + "epoch": 6.296325878594249, + "grad_norm": 0.03495810180902481, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7883 + }, + { + "epoch": 6.297124600638978, + "grad_norm": 0.07738466560840607, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7884 + }, + { + "epoch": 6.297923322683706, + "grad_norm": 0.06034242361783981, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7885 + }, + { + "epoch": 6.298722044728435, + "grad_norm": 0.04083844646811485, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7886 + }, + { + "epoch": 6.2995207667731625, + "grad_norm": 0.0918336734175682, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7887 + }, + { + "epoch": 6.300319488817891, + "grad_norm": 0.07351864874362946, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7888 + }, + { + "epoch": 6.30111821086262, + "grad_norm": 0.042986564338207245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7889 + }, + { + "epoch": 6.301916932907348, + "grad_norm": 0.05983031541109085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7890 + }, + { + "epoch": 6.302715654952077, + "grad_norm": 0.10980594903230667, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7891 + }, + { + "epoch": 6.303514376996805, + "grad_norm": 0.04517138749361038, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7892 + }, + { + "epoch": 6.304313099041534, + "grad_norm": 0.08489427715539932, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7893 + }, + { + "epoch": 6.305111821086262, + "grad_norm": 0.040421262383461, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7894 + }, + { + "epoch": 6.305910543130991, + "grad_norm": 0.0438009649515152, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7895 + }, + { + "epoch": 6.306709265175719, + "grad_norm": 0.05797100067138672, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7896 + }, + { + "epoch": 6.307507987220447, + "grad_norm": 0.08798980712890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7897 + }, + { + "epoch": 6.3083067092651754, + "grad_norm": 0.0502130500972271, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7898 + }, + { + "epoch": 6.309105431309904, + "grad_norm": 0.11610639840364456, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7899 + }, + { + "epoch": 6.3099041533546325, + "grad_norm": 0.061168819665908813, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7900 + }, + { + "epoch": 6.310702875399361, + "grad_norm": 0.0469425804913044, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7901 + }, + { + "epoch": 6.31150159744409, + "grad_norm": 0.0483059324324131, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7902 + }, + { + "epoch": 6.312300319488818, + "grad_norm": 0.120233453810215, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7903 + }, + { + "epoch": 6.313099041533547, + "grad_norm": 0.10025710612535477, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7904 + }, + { + "epoch": 6.313897763578275, + "grad_norm": 0.08750995993614197, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7905 + }, + { + "epoch": 6.314696485623003, + "grad_norm": 0.31308433413505554, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7906 + }, + { + "epoch": 6.315495207667731, + "grad_norm": 0.06390809267759323, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7907 + }, + { + "epoch": 6.31629392971246, + "grad_norm": 0.0657041072845459, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7908 + }, + { + "epoch": 6.317092651757188, + "grad_norm": 0.09626918286085129, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7909 + }, + { + "epoch": 6.317891373801917, + "grad_norm": 0.05565343424677849, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7910 + }, + { + "epoch": 6.318690095846645, + "grad_norm": 0.06147831678390503, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7911 + }, + { + "epoch": 6.319488817891374, + "grad_norm": 0.08704033493995667, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7912 + }, + { + "epoch": 6.3202875399361025, + "grad_norm": 0.04405020549893379, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7913 + }, + { + "epoch": 6.321086261980831, + "grad_norm": 0.07587708532810211, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7914 + }, + { + "epoch": 6.321884984025559, + "grad_norm": 0.05935811623930931, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7915 + }, + { + "epoch": 6.322683706070287, + "grad_norm": 0.045584313571453094, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7916 + }, + { + "epoch": 6.323482428115016, + "grad_norm": 0.065196193754673, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7917 + }, + { + "epoch": 6.324281150159744, + "grad_norm": 0.05996553227305412, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7918 + }, + { + "epoch": 6.325079872204473, + "grad_norm": 0.04771357774734497, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7919 + }, + { + "epoch": 6.325878594249201, + "grad_norm": 0.05875687673687935, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7920 + }, + { + "epoch": 6.32667731629393, + "grad_norm": 0.15765227377414703, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7921 + }, + { + "epoch": 6.327476038338658, + "grad_norm": 0.038563717156648636, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7922 + }, + { + "epoch": 6.328274760383387, + "grad_norm": 0.04321083426475525, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7923 + }, + { + "epoch": 6.329073482428115, + "grad_norm": 0.04427725449204445, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7924 + }, + { + "epoch": 6.329872204472843, + "grad_norm": 0.06047825515270233, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7925 + }, + { + "epoch": 6.330670926517572, + "grad_norm": 0.05161035805940628, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7926 + }, + { + "epoch": 6.3314696485623, + "grad_norm": 0.06512151658535004, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7927 + }, + { + "epoch": 6.332268370607029, + "grad_norm": 0.05178358778357506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7928 + }, + { + "epoch": 6.333067092651757, + "grad_norm": 0.06199260801076889, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7929 + }, + { + "epoch": 6.333865814696486, + "grad_norm": 0.09948168694972992, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7930 + }, + { + "epoch": 6.334664536741214, + "grad_norm": 0.06568150222301483, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7931 + }, + { + "epoch": 6.335463258785943, + "grad_norm": 0.036642882972955704, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7932 + }, + { + "epoch": 6.336261980830671, + "grad_norm": 0.04814688116312027, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7933 + }, + { + "epoch": 6.3370607028754, + "grad_norm": 0.03938854858279228, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7934 + }, + { + "epoch": 6.337859424920127, + "grad_norm": 0.07778320461511612, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7935 + }, + { + "epoch": 6.338658146964856, + "grad_norm": 0.16271090507507324, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7936 + }, + { + "epoch": 6.3394568690095845, + "grad_norm": 0.3652990460395813, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7937 + }, + { + "epoch": 6.340255591054313, + "grad_norm": 0.0592365525662899, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7938 + }, + { + "epoch": 6.3410543130990416, + "grad_norm": 0.28622883558273315, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7939 + }, + { + "epoch": 6.34185303514377, + "grad_norm": 0.2270730584859848, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7940 + }, + { + "epoch": 6.342651757188499, + "grad_norm": 0.10781756043434143, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7941 + }, + { + "epoch": 6.343450479233227, + "grad_norm": 0.11611706018447876, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7942 + }, + { + "epoch": 6.344249201277956, + "grad_norm": 0.08212626725435257, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7943 + }, + { + "epoch": 6.345047923322683, + "grad_norm": 0.0739196389913559, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7944 + }, + { + "epoch": 6.345846645367412, + "grad_norm": 0.1029743030667305, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7945 + }, + { + "epoch": 6.34664536741214, + "grad_norm": 0.2787686586380005, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7946 + }, + { + "epoch": 6.347444089456869, + "grad_norm": 0.12180152535438538, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7947 + }, + { + "epoch": 6.348242811501597, + "grad_norm": 0.178681880235672, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7948 + }, + { + "epoch": 6.349041533546326, + "grad_norm": 0.10219722986221313, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7949 + }, + { + "epoch": 6.3498402555910545, + "grad_norm": 0.0773158147931099, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7950 + }, + { + "epoch": 6.350638977635783, + "grad_norm": 0.15096192061901093, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7951 + }, + { + "epoch": 6.3514376996805115, + "grad_norm": 0.06237277388572693, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7952 + }, + { + "epoch": 6.352236421725239, + "grad_norm": 1.4819257259368896, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7953 + }, + { + "epoch": 6.353035143769968, + "grad_norm": 0.09716464579105377, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7954 + }, + { + "epoch": 6.353833865814696, + "grad_norm": 0.10105668753385544, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7955 + }, + { + "epoch": 6.354632587859425, + "grad_norm": 0.09361526370048523, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7956 + }, + { + "epoch": 6.355431309904153, + "grad_norm": 0.04209212213754654, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7957 + }, + { + "epoch": 6.356230031948882, + "grad_norm": 0.11653190106153488, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7958 + }, + { + "epoch": 6.35702875399361, + "grad_norm": 0.1552112102508545, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7959 + }, + { + "epoch": 6.357827476038339, + "grad_norm": 0.07934660464525223, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7960 + }, + { + "epoch": 6.358626198083067, + "grad_norm": 0.10928693413734436, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7961 + }, + { + "epoch": 6.359424920127796, + "grad_norm": 0.15923380851745605, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7962 + }, + { + "epoch": 6.360223642172524, + "grad_norm": 0.12151104211807251, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7963 + }, + { + "epoch": 6.361022364217252, + "grad_norm": 0.055971868336200714, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7964 + }, + { + "epoch": 6.361821086261981, + "grad_norm": 0.17611366510391235, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7965 + }, + { + "epoch": 6.362619808306709, + "grad_norm": 0.16098986566066742, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7966 + }, + { + "epoch": 6.363418530351438, + "grad_norm": 1.6793769598007202, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7967 + }, + { + "epoch": 6.364217252396166, + "grad_norm": 0.4322223365306854, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 7968 + }, + { + "epoch": 6.365015974440895, + "grad_norm": 0.35510173439979553, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 7969 + }, + { + "epoch": 6.365814696485623, + "grad_norm": 0.08799898624420166, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7970 + }, + { + "epoch": 6.366613418530352, + "grad_norm": 0.28774675726890564, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7971 + }, + { + "epoch": 6.36741214057508, + "grad_norm": 0.28109011054039, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7972 + }, + { + "epoch": 6.368210862619808, + "grad_norm": 0.09055986255407333, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7973 + }, + { + "epoch": 6.3690095846645365, + "grad_norm": 0.15083353221416473, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7974 + }, + { + "epoch": 6.369808306709265, + "grad_norm": 0.20686668157577515, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7975 + }, + { + "epoch": 6.3706070287539935, + "grad_norm": 0.047575660049915314, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 7976 + }, + { + "epoch": 6.371405750798722, + "grad_norm": 0.25424477458000183, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7977 + }, + { + "epoch": 6.372204472843451, + "grad_norm": 0.21839222311973572, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7978 + }, + { + "epoch": 6.373003194888179, + "grad_norm": 0.06493431329727173, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7979 + }, + { + "epoch": 6.373801916932908, + "grad_norm": 0.2369518280029297, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 7980 + }, + { + "epoch": 6.374600638977636, + "grad_norm": 0.14641214907169342, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7981 + }, + { + "epoch": 6.375399361022364, + "grad_norm": 0.11602997034788132, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7982 + }, + { + "epoch": 6.376198083067092, + "grad_norm": 0.18792425096035004, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7983 + }, + { + "epoch": 6.376996805111821, + "grad_norm": 0.06824373453855515, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7984 + }, + { + "epoch": 6.377795527156549, + "grad_norm": 0.1228032335639, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7985 + }, + { + "epoch": 6.378594249201278, + "grad_norm": 0.15771286189556122, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7986 + }, + { + "epoch": 6.3793929712460065, + "grad_norm": 0.1157795861363411, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7987 + }, + { + "epoch": 6.380191693290735, + "grad_norm": 0.07282877713441849, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7988 + }, + { + "epoch": 6.3809904153354635, + "grad_norm": 0.10168643295764923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7989 + }, + { + "epoch": 6.381789137380192, + "grad_norm": 0.24466580152511597, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7990 + }, + { + "epoch": 6.38258785942492, + "grad_norm": 0.0972297191619873, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7991 + }, + { + "epoch": 6.383386581469648, + "grad_norm": 0.08349917083978653, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7992 + }, + { + "epoch": 6.384185303514377, + "grad_norm": 0.058114584535360336, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7993 + }, + { + "epoch": 6.384984025559105, + "grad_norm": 0.04745171591639519, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7994 + }, + { + "epoch": 6.385782747603834, + "grad_norm": 0.05484034866094589, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7995 + }, + { + "epoch": 6.386581469648562, + "grad_norm": 0.05094960704445839, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7996 + }, + { + "epoch": 6.387380191693291, + "grad_norm": 0.06368618458509445, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7997 + }, + { + "epoch": 6.388178913738019, + "grad_norm": 0.07042541354894638, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7998 + }, + { + "epoch": 6.388977635782748, + "grad_norm": 0.06182365491986275, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7999 + }, + { + "epoch": 6.389776357827476, + "grad_norm": 0.05778853967785835, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8000 + }, + { + "epoch": 6.390575079872204, + "grad_norm": 0.04334365949034691, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8001 + }, + { + "epoch": 6.391373801916933, + "grad_norm": 0.08214148133993149, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8002 + }, + { + "epoch": 6.392172523961661, + "grad_norm": 0.05468964949250221, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8003 + }, + { + "epoch": 6.39297124600639, + "grad_norm": 0.07484348863363266, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8004 + }, + { + "epoch": 6.393769968051118, + "grad_norm": 0.04987887665629387, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8005 + }, + { + "epoch": 6.394568690095847, + "grad_norm": 0.05584597587585449, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8006 + }, + { + "epoch": 6.395367412140575, + "grad_norm": 0.07088904082775116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8007 + }, + { + "epoch": 6.396166134185304, + "grad_norm": 0.26695576310157776, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8008 + }, + { + "epoch": 6.396964856230032, + "grad_norm": 0.06452658027410507, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8009 + }, + { + "epoch": 6.397763578274761, + "grad_norm": 0.08994145691394806, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8010 + }, + { + "epoch": 6.3985623003194885, + "grad_norm": 0.06565240770578384, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8011 + }, + { + "epoch": 6.399361022364217, + "grad_norm": 0.0492648184299469, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8012 + }, + { + "epoch": 6.4001597444089455, + "grad_norm": 0.06946985423564911, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8013 + }, + { + "epoch": 6.400958466453674, + "grad_norm": 0.08669331669807434, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8014 + }, + { + "epoch": 6.401757188498403, + "grad_norm": 0.07930289953947067, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8015 + }, + { + "epoch": 6.402555910543131, + "grad_norm": 0.15216746926307678, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8016 + }, + { + "epoch": 6.40335463258786, + "grad_norm": 0.051862914115190506, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8017 + }, + { + "epoch": 6.404153354632588, + "grad_norm": 0.044119443744421005, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8018 + }, + { + "epoch": 6.404952076677317, + "grad_norm": 0.09787813574075699, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8019 + }, + { + "epoch": 6.405750798722044, + "grad_norm": 0.05269203707575798, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8020 + }, + { + "epoch": 6.406549520766773, + "grad_norm": 0.06683865934610367, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8021 + }, + { + "epoch": 6.407348242811501, + "grad_norm": 0.04334628954529762, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8022 + }, + { + "epoch": 6.40814696485623, + "grad_norm": 0.037559930235147476, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8023 + }, + { + "epoch": 6.4089456869009584, + "grad_norm": 0.21066749095916748, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8024 + }, + { + "epoch": 6.409744408945687, + "grad_norm": 0.05721563845872879, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8025 + }, + { + "epoch": 6.4105431309904155, + "grad_norm": 0.047683823853731155, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8026 + }, + { + "epoch": 6.411341853035144, + "grad_norm": 0.05377231910824776, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8027 + }, + { + "epoch": 6.412140575079873, + "grad_norm": 0.05604357272386551, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8028 + }, + { + "epoch": 6.4129392971246, + "grad_norm": 0.051680225878953934, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8029 + }, + { + "epoch": 6.413738019169329, + "grad_norm": 0.04465701803565025, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8030 + }, + { + "epoch": 6.414536741214057, + "grad_norm": 0.0454387366771698, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8031 + }, + { + "epoch": 6.415335463258786, + "grad_norm": 0.5079139471054077, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8032 + }, + { + "epoch": 6.416134185303514, + "grad_norm": 0.08386353403329849, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8033 + }, + { + "epoch": 6.416932907348243, + "grad_norm": 0.06023477017879486, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8034 + }, + { + "epoch": 6.417731629392971, + "grad_norm": 0.8634743094444275, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8035 + }, + { + "epoch": 6.4185303514377, + "grad_norm": 0.06926131993532181, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8036 + }, + { + "epoch": 6.419329073482428, + "grad_norm": 0.07563464343547821, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8037 + }, + { + "epoch": 6.420127795527157, + "grad_norm": 0.10181237757205963, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8038 + }, + { + "epoch": 6.420926517571885, + "grad_norm": 0.13995511829853058, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8039 + }, + { + "epoch": 6.421725239616613, + "grad_norm": 0.05968187376856804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8040 + }, + { + "epoch": 6.422523961661342, + "grad_norm": 0.14419680833816528, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8041 + }, + { + "epoch": 6.42332268370607, + "grad_norm": 0.13762469589710236, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8042 + }, + { + "epoch": 6.424121405750799, + "grad_norm": 0.0627644956111908, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8043 + }, + { + "epoch": 6.424920127795527, + "grad_norm": 0.1356768012046814, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8044 + }, + { + "epoch": 6.425718849840256, + "grad_norm": 0.12080833315849304, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8045 + }, + { + "epoch": 6.426517571884984, + "grad_norm": 0.048654112964868546, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8046 + }, + { + "epoch": 6.427316293929713, + "grad_norm": 0.11983022093772888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8047 + }, + { + "epoch": 6.428115015974441, + "grad_norm": 0.09429550170898438, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8048 + }, + { + "epoch": 6.428913738019169, + "grad_norm": 0.07924454659223557, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8049 + }, + { + "epoch": 6.4297124600638975, + "grad_norm": 0.15244926512241364, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8050 + }, + { + "epoch": 6.430511182108626, + "grad_norm": 0.9872325658798218, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8051 + }, + { + "epoch": 6.431309904153355, + "grad_norm": 0.0790395438671112, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8052 + }, + { + "epoch": 6.432108626198083, + "grad_norm": 0.3828068673610687, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8053 + }, + { + "epoch": 6.432907348242812, + "grad_norm": 0.059630244970321655, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8054 + }, + { + "epoch": 6.43370607028754, + "grad_norm": 0.07113327085971832, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8055 + }, + { + "epoch": 6.434504792332269, + "grad_norm": 0.0496523454785347, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8056 + }, + { + "epoch": 6.435303514376997, + "grad_norm": 0.08502436429262161, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8057 + }, + { + "epoch": 6.436102236421725, + "grad_norm": 0.06082376837730408, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8058 + }, + { + "epoch": 6.436900958466453, + "grad_norm": 0.1668524146080017, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8059 + }, + { + "epoch": 6.437699680511182, + "grad_norm": 0.05411513149738312, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8060 + }, + { + "epoch": 6.43849840255591, + "grad_norm": 0.05176519230008125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8061 + }, + { + "epoch": 6.439297124600639, + "grad_norm": 0.0684237852692604, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8062 + }, + { + "epoch": 6.4400958466453675, + "grad_norm": 0.0715038925409317, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8063 + }, + { + "epoch": 6.440894568690096, + "grad_norm": 0.11311113089323044, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8064 + }, + { + "epoch": 6.4416932907348246, + "grad_norm": 0.06320979446172714, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8065 + }, + { + "epoch": 6.442492012779553, + "grad_norm": 0.09221892803907394, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8066 + }, + { + "epoch": 6.443290734824281, + "grad_norm": 0.1183326244354248, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8067 + }, + { + "epoch": 6.444089456869009, + "grad_norm": 0.08447464555501938, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8068 + }, + { + "epoch": 6.444888178913738, + "grad_norm": 0.21791045367717743, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8069 + }, + { + "epoch": 6.445686900958466, + "grad_norm": 0.055015772581100464, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8070 + }, + { + "epoch": 6.446485623003195, + "grad_norm": 0.13536514341831207, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8071 + }, + { + "epoch": 6.447284345047923, + "grad_norm": 0.16620422899723053, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8072 + }, + { + "epoch": 6.448083067092652, + "grad_norm": 0.08793147653341293, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8073 + }, + { + "epoch": 6.44888178913738, + "grad_norm": 0.0962347462773323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8074 + }, + { + "epoch": 6.449680511182109, + "grad_norm": 0.08764681965112686, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 8075 + }, + { + "epoch": 6.4504792332268375, + "grad_norm": 0.06176106259226799, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8076 + }, + { + "epoch": 6.451277955271565, + "grad_norm": 0.06823577731847763, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8077 + }, + { + "epoch": 6.452076677316294, + "grad_norm": 0.11239560693502426, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8078 + }, + { + "epoch": 6.452875399361022, + "grad_norm": 0.10309527069330215, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8079 + }, + { + "epoch": 6.453674121405751, + "grad_norm": 0.07533836364746094, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8080 + }, + { + "epoch": 6.454472843450479, + "grad_norm": 0.06650671362876892, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8081 + }, + { + "epoch": 6.455271565495208, + "grad_norm": 0.1700691431760788, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8082 + }, + { + "epoch": 6.456070287539936, + "grad_norm": 0.06135572865605354, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8083 + }, + { + "epoch": 6.456869009584665, + "grad_norm": 0.08333424478769302, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8084 + }, + { + "epoch": 6.457667731629393, + "grad_norm": 0.1338927149772644, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8085 + }, + { + "epoch": 6.458466453674122, + "grad_norm": 0.07097163796424866, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8086 + }, + { + "epoch": 6.4592651757188495, + "grad_norm": 0.06296008080244064, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8087 + }, + { + "epoch": 6.460063897763578, + "grad_norm": 0.060656916350126266, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8088 + }, + { + "epoch": 6.460862619808307, + "grad_norm": 0.044889576733112335, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8089 + }, + { + "epoch": 6.461661341853035, + "grad_norm": 0.0749807357788086, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8090 + }, + { + "epoch": 6.462460063897764, + "grad_norm": 0.07509054243564606, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8091 + }, + { + "epoch": 6.463258785942492, + "grad_norm": 0.054954417049884796, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8092 + }, + { + "epoch": 6.464057507987221, + "grad_norm": 0.05087047815322876, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8093 + }, + { + "epoch": 6.464856230031949, + "grad_norm": 0.12205887585878372, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8094 + }, + { + "epoch": 6.465654952076678, + "grad_norm": 0.08342424035072327, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8095 + }, + { + "epoch": 6.466453674121405, + "grad_norm": 0.12507228553295135, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8096 + }, + { + "epoch": 6.467252396166134, + "grad_norm": 0.10491037368774414, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8097 + }, + { + "epoch": 6.468051118210862, + "grad_norm": 0.04236119985580444, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8098 + }, + { + "epoch": 6.468849840255591, + "grad_norm": 0.10601458698511124, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8099 + }, + { + "epoch": 6.4696485623003195, + "grad_norm": 0.07485921680927277, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8100 + }, + { + "epoch": 6.470447284345048, + "grad_norm": 0.06351220607757568, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8101 + }, + { + "epoch": 6.4712460063897765, + "grad_norm": 0.08351211249828339, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8102 + }, + { + "epoch": 6.472044728434505, + "grad_norm": 0.07205908000469208, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8103 + }, + { + "epoch": 6.472843450479234, + "grad_norm": 0.07072018831968307, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8104 + }, + { + "epoch": 6.473642172523961, + "grad_norm": 0.0851733461022377, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8105 + }, + { + "epoch": 6.47444089456869, + "grad_norm": 0.07046044617891312, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8106 + }, + { + "epoch": 6.475239616613418, + "grad_norm": 0.03804340958595276, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8107 + }, + { + "epoch": 6.476038338658147, + "grad_norm": 0.059083763509988785, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8108 + }, + { + "epoch": 6.476837060702875, + "grad_norm": 0.0419149249792099, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8109 + }, + { + "epoch": 6.477635782747604, + "grad_norm": 0.07814865559339523, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8110 + }, + { + "epoch": 6.478434504792332, + "grad_norm": 0.12653781473636627, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8111 + }, + { + "epoch": 6.479233226837061, + "grad_norm": 0.10124429315328598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8112 + }, + { + "epoch": 6.4800319488817895, + "grad_norm": 0.05563808232545853, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8113 + }, + { + "epoch": 6.480830670926517, + "grad_norm": 0.07036174833774567, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8114 + }, + { + "epoch": 6.481629392971246, + "grad_norm": 0.0452839694917202, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8115 + }, + { + "epoch": 6.482428115015974, + "grad_norm": 0.13880759477615356, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8116 + }, + { + "epoch": 6.483226837060703, + "grad_norm": 0.03902722895145416, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8117 + }, + { + "epoch": 6.484025559105431, + "grad_norm": 0.08136945217847824, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8118 + }, + { + "epoch": 6.48482428115016, + "grad_norm": 0.09874774515628815, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8119 + }, + { + "epoch": 6.485623003194888, + "grad_norm": 0.06836161017417908, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8120 + }, + { + "epoch": 6.486421725239617, + "grad_norm": 0.1439940482378006, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8121 + }, + { + "epoch": 6.487220447284345, + "grad_norm": 0.0924125388264656, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8122 + }, + { + "epoch": 6.488019169329074, + "grad_norm": 0.06811019778251648, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8123 + }, + { + "epoch": 6.488817891373802, + "grad_norm": 0.1259799599647522, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8124 + }, + { + "epoch": 6.48961661341853, + "grad_norm": 0.1088009849190712, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8125 + }, + { + "epoch": 6.4904153354632586, + "grad_norm": 0.27054721117019653, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8126 + }, + { + "epoch": 6.491214057507987, + "grad_norm": 0.09674181789159775, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8127 + }, + { + "epoch": 6.492012779552716, + "grad_norm": 0.15491390228271484, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8128 + }, + { + "epoch": 6.492811501597444, + "grad_norm": 0.08790267258882523, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8129 + }, + { + "epoch": 6.493610223642173, + "grad_norm": 0.19372408092021942, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8130 + }, + { + "epoch": 6.494408945686901, + "grad_norm": 0.14786171913146973, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8131 + }, + { + "epoch": 6.49520766773163, + "grad_norm": 0.09591338783502579, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8132 + }, + { + "epoch": 6.496006389776358, + "grad_norm": 0.1810663491487503, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8133 + }, + { + "epoch": 6.496805111821086, + "grad_norm": 0.19754691421985626, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8134 + }, + { + "epoch": 6.497603833865814, + "grad_norm": 0.14094877243041992, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8135 + }, + { + "epoch": 6.498402555910543, + "grad_norm": 0.0782506987452507, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8136 + }, + { + "epoch": 6.4992012779552715, + "grad_norm": 0.19543413817882538, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8137 + }, + { + "epoch": 6.5, + "grad_norm": 0.3102439045906067, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8138 + }, + { + "epoch": 6.5007987220447285, + "grad_norm": 0.13952040672302246, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8139 + }, + { + "epoch": 6.501597444089457, + "grad_norm": 0.1902403086423874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8140 + }, + { + "epoch": 6.502396166134186, + "grad_norm": 0.2608654499053955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8141 + }, + { + "epoch": 6.503194888178914, + "grad_norm": 0.22480152547359467, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8142 + }, + { + "epoch": 6.503993610223642, + "grad_norm": 0.21580660343170166, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8143 + }, + { + "epoch": 6.50479233226837, + "grad_norm": 0.1991831213235855, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8144 + }, + { + "epoch": 6.505591054313099, + "grad_norm": 0.25885632634162903, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8145 + }, + { + "epoch": 6.506389776357827, + "grad_norm": 0.2533574104309082, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8146 + }, + { + "epoch": 6.507188498402556, + "grad_norm": 0.11494381725788116, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8147 + }, + { + "epoch": 6.507987220447284, + "grad_norm": 0.1361113339662552, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8148 + }, + { + "epoch": 6.508785942492013, + "grad_norm": 0.22099947929382324, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8149 + }, + { + "epoch": 6.5095846645367414, + "grad_norm": 0.13223077356815338, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8150 + }, + { + "epoch": 6.51038338658147, + "grad_norm": 0.18203037977218628, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8151 + }, + { + "epoch": 6.511182108626198, + "grad_norm": 0.18066702783107758, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8152 + }, + { + "epoch": 6.511980830670926, + "grad_norm": 0.09984144568443298, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8153 + }, + { + "epoch": 6.512779552715655, + "grad_norm": 0.12803718447685242, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8154 + }, + { + "epoch": 6.513578274760383, + "grad_norm": 0.19731956720352173, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8155 + }, + { + "epoch": 6.514376996805112, + "grad_norm": 0.10687378793954849, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8156 + }, + { + "epoch": 6.51517571884984, + "grad_norm": 0.0971442237496376, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8157 + }, + { + "epoch": 6.515974440894569, + "grad_norm": 0.12840867042541504, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8158 + }, + { + "epoch": 6.516773162939297, + "grad_norm": 0.1245417669415474, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8159 + }, + { + "epoch": 6.517571884984026, + "grad_norm": 0.16850991547107697, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8160 + }, + { + "epoch": 6.518370607028754, + "grad_norm": 0.1931404322385788, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8161 + }, + { + "epoch": 6.519169329073483, + "grad_norm": 0.08180713653564453, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8162 + }, + { + "epoch": 6.5199680511182105, + "grad_norm": 0.24530328810214996, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8163 + }, + { + "epoch": 6.520766773162939, + "grad_norm": 0.14107894897460938, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8164 + }, + { + "epoch": 6.521565495207668, + "grad_norm": 0.07984111458063126, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8165 + }, + { + "epoch": 6.522364217252396, + "grad_norm": 0.20894968509674072, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8166 + }, + { + "epoch": 6.523162939297125, + "grad_norm": 0.09663927555084229, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8167 + }, + { + "epoch": 6.523961661341853, + "grad_norm": 0.0913434773683548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8168 + }, + { + "epoch": 6.524760383386582, + "grad_norm": 0.1247463971376419, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8169 + }, + { + "epoch": 6.52555910543131, + "grad_norm": 0.06504802405834198, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8170 + }, + { + "epoch": 6.526357827476039, + "grad_norm": 0.10900555551052094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8171 + }, + { + "epoch": 6.527156549520766, + "grad_norm": 0.047379642724990845, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8172 + }, + { + "epoch": 6.527955271565495, + "grad_norm": 0.17822134494781494, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8173 + }, + { + "epoch": 6.5287539936102235, + "grad_norm": 0.07658754289150238, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8174 + }, + { + "epoch": 6.529552715654952, + "grad_norm": 0.17294292151927948, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8175 + }, + { + "epoch": 6.5303514376996805, + "grad_norm": 0.07095851004123688, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8176 + }, + { + "epoch": 6.531150159744409, + "grad_norm": 0.07328472286462784, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8177 + }, + { + "epoch": 6.531948881789138, + "grad_norm": 0.11216691881418228, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8178 + }, + { + "epoch": 6.532747603833866, + "grad_norm": 0.3007374703884125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8179 + }, + { + "epoch": 6.533546325878595, + "grad_norm": 0.06059226021170616, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8180 + }, + { + "epoch": 6.534345047923322, + "grad_norm": 0.14438967406749725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8181 + }, + { + "epoch": 6.535143769968051, + "grad_norm": 0.1965394914150238, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8182 + }, + { + "epoch": 6.535942492012779, + "grad_norm": 0.130478173494339, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8183 + }, + { + "epoch": 6.536741214057508, + "grad_norm": 0.16713190078735352, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8184 + }, + { + "epoch": 6.537539936102236, + "grad_norm": 0.18644076585769653, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8185 + }, + { + "epoch": 6.538338658146965, + "grad_norm": 0.06685839593410492, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8186 + }, + { + "epoch": 6.539137380191693, + "grad_norm": 0.17819803953170776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8187 + }, + { + "epoch": 6.539936102236422, + "grad_norm": 0.5894746780395508, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8188 + }, + { + "epoch": 6.5407348242811505, + "grad_norm": 0.088719442486763, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8189 + }, + { + "epoch": 6.541533546325878, + "grad_norm": 0.1336045265197754, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8190 + }, + { + "epoch": 6.542332268370607, + "grad_norm": 0.12859520316123962, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8191 + }, + { + "epoch": 6.543130990415335, + "grad_norm": 0.13402487337589264, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8192 + }, + { + "epoch": 6.543929712460064, + "grad_norm": 0.11415290832519531, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8193 + }, + { + "epoch": 6.544728434504792, + "grad_norm": 0.1775715947151184, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8194 + }, + { + "epoch": 6.545527156549521, + "grad_norm": 0.6331294775009155, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8195 + }, + { + "epoch": 6.546325878594249, + "grad_norm": 0.09323445707559586, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8196 + }, + { + "epoch": 6.547124600638978, + "grad_norm": 0.1761421412229538, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8197 + }, + { + "epoch": 6.547923322683706, + "grad_norm": 0.09608824551105499, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8198 + }, + { + "epoch": 6.548722044728435, + "grad_norm": 0.07564207166433334, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8199 + }, + { + "epoch": 6.549520766773163, + "grad_norm": 0.08033318817615509, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8200 + }, + { + "epoch": 6.550319488817891, + "grad_norm": 0.13604776561260223, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8201 + }, + { + "epoch": 6.55111821086262, + "grad_norm": 0.1046299859881401, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8202 + }, + { + "epoch": 6.551916932907348, + "grad_norm": 0.23783712089061737, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8203 + }, + { + "epoch": 6.552715654952077, + "grad_norm": 0.07360750436782837, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8204 + }, + { + "epoch": 6.553514376996805, + "grad_norm": 0.07213526219129562, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8205 + }, + { + "epoch": 6.554313099041534, + "grad_norm": 0.12431066483259201, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8206 + }, + { + "epoch": 6.555111821086262, + "grad_norm": 0.09665104001760483, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8207 + }, + { + "epoch": 6.555910543130991, + "grad_norm": 0.22090987861156464, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8208 + }, + { + "epoch": 6.556709265175719, + "grad_norm": 0.14936690032482147, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8209 + }, + { + "epoch": 6.557507987220447, + "grad_norm": 0.09804648160934448, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8210 + }, + { + "epoch": 6.5583067092651754, + "grad_norm": 0.07829400897026062, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8211 + }, + { + "epoch": 6.559105431309904, + "grad_norm": 0.08218041807413101, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8212 + }, + { + "epoch": 6.5599041533546325, + "grad_norm": 0.08018422871828079, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8213 + }, + { + "epoch": 6.560702875399361, + "grad_norm": 0.07790627330541611, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8214 + }, + { + "epoch": 6.56150159744409, + "grad_norm": 0.12526501715183258, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8215 + }, + { + "epoch": 6.562300319488818, + "grad_norm": 0.15222279727458954, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8216 + }, + { + "epoch": 6.563099041533547, + "grad_norm": 0.19605369865894318, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8217 + }, + { + "epoch": 6.563897763578275, + "grad_norm": 1.4426831007003784, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8218 + }, + { + "epoch": 6.564696485623003, + "grad_norm": 0.184299498796463, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8219 + }, + { + "epoch": 6.565495207667731, + "grad_norm": 0.12029392272233963, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8220 + }, + { + "epoch": 6.56629392971246, + "grad_norm": 0.07442726939916611, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8221 + }, + { + "epoch": 6.567092651757188, + "grad_norm": 0.14331156015396118, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8222 + }, + { + "epoch": 6.567891373801917, + "grad_norm": 0.11202000081539154, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 8223 + }, + { + "epoch": 6.568690095846645, + "grad_norm": 0.10699515789747238, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8224 + }, + { + "epoch": 6.569488817891374, + "grad_norm": 0.07708705961704254, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8225 + }, + { + "epoch": 6.5702875399361025, + "grad_norm": 0.08026644587516785, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8226 + }, + { + "epoch": 6.571086261980831, + "grad_norm": 0.08694002777338028, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8227 + }, + { + "epoch": 6.571884984025559, + "grad_norm": 0.11824248731136322, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8228 + }, + { + "epoch": 6.572683706070287, + "grad_norm": 0.06505008041858673, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8229 + }, + { + "epoch": 6.573482428115016, + "grad_norm": 0.05341152846813202, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8230 + }, + { + "epoch": 6.574281150159744, + "grad_norm": 0.09604120999574661, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8231 + }, + { + "epoch": 6.575079872204473, + "grad_norm": 0.08336330950260162, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8232 + }, + { + "epoch": 6.575878594249201, + "grad_norm": 0.06368359923362732, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8233 + }, + { + "epoch": 6.57667731629393, + "grad_norm": 0.13115698099136353, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8234 + }, + { + "epoch": 6.577476038338658, + "grad_norm": 0.08847527951002121, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8235 + }, + { + "epoch": 6.578274760383387, + "grad_norm": 0.0458359532058239, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8236 + }, + { + "epoch": 6.5790734824281145, + "grad_norm": 0.10106709599494934, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8237 + }, + { + "epoch": 6.579872204472844, + "grad_norm": 0.06641486287117004, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8238 + }, + { + "epoch": 6.580670926517572, + "grad_norm": 0.0733480304479599, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8239 + }, + { + "epoch": 6.5814696485623, + "grad_norm": 0.07835566252470016, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8240 + }, + { + "epoch": 6.582268370607029, + "grad_norm": 0.13473013043403625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8241 + }, + { + "epoch": 6.583067092651757, + "grad_norm": 0.062259674072265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8242 + }, + { + "epoch": 6.583865814696486, + "grad_norm": 0.05236242339015007, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8243 + }, + { + "epoch": 6.584664536741214, + "grad_norm": 0.08255355805158615, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8244 + }, + { + "epoch": 6.585463258785943, + "grad_norm": 0.1182556301355362, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8245 + }, + { + "epoch": 6.586261980830671, + "grad_norm": 0.0555981881916523, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8246 + }, + { + "epoch": 6.5870607028754, + "grad_norm": 0.09490877389907837, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8247 + }, + { + "epoch": 6.587859424920127, + "grad_norm": 0.6106880903244019, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8248 + }, + { + "epoch": 6.588658146964856, + "grad_norm": 0.0474761538207531, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8249 + }, + { + "epoch": 6.5894568690095845, + "grad_norm": 0.1429997831583023, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8250 + }, + { + "epoch": 6.590255591054313, + "grad_norm": 0.0815487951040268, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8251 + }, + { + "epoch": 6.5910543130990416, + "grad_norm": 0.096903957426548, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8252 + }, + { + "epoch": 6.59185303514377, + "grad_norm": 0.17775478959083557, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8253 + }, + { + "epoch": 6.592651757188499, + "grad_norm": 0.11637275665998459, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8254 + }, + { + "epoch": 6.593450479233227, + "grad_norm": 0.08475788682699203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8255 + }, + { + "epoch": 6.594249201277956, + "grad_norm": 0.1786298304796219, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8256 + }, + { + "epoch": 6.595047923322683, + "grad_norm": 0.12316745519638062, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8257 + }, + { + "epoch": 6.595846645367412, + "grad_norm": 0.5367861986160278, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8258 + }, + { + "epoch": 6.59664536741214, + "grad_norm": 0.2289825677871704, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8259 + }, + { + "epoch": 6.597444089456869, + "grad_norm": 0.17333106696605682, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8260 + }, + { + "epoch": 6.598242811501597, + "grad_norm": 0.10858172923326492, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8261 + }, + { + "epoch": 6.599041533546326, + "grad_norm": 0.2013384997844696, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8262 + }, + { + "epoch": 6.5998402555910545, + "grad_norm": 0.13658639788627625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8263 + }, + { + "epoch": 6.600638977635783, + "grad_norm": 0.12755805253982544, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8264 + }, + { + "epoch": 6.6014376996805115, + "grad_norm": 0.18299050629138947, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 6.602236421725239, + "grad_norm": 0.07105828821659088, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8266 + }, + { + "epoch": 6.603035143769968, + "grad_norm": 0.13049830496311188, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8267 + }, + { + "epoch": 6.603833865814696, + "grad_norm": 0.16121532022953033, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8268 + }, + { + "epoch": 6.604632587859425, + "grad_norm": 0.07512015104293823, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8269 + }, + { + "epoch": 6.605431309904153, + "grad_norm": 0.17407254874706268, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8270 + }, + { + "epoch": 6.606230031948882, + "grad_norm": 0.11297854781150818, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8271 + }, + { + "epoch": 6.60702875399361, + "grad_norm": 0.2839175760746002, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8272 + }, + { + "epoch": 6.607827476038339, + "grad_norm": 0.07847599685192108, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8273 + }, + { + "epoch": 6.608626198083067, + "grad_norm": 0.08995212614536285, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8274 + }, + { + "epoch": 6.609424920127795, + "grad_norm": 0.07382770627737045, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8275 + }, + { + "epoch": 6.6102236421725244, + "grad_norm": 0.06170637533068657, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8276 + }, + { + "epoch": 6.611022364217252, + "grad_norm": 0.07311394810676575, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8277 + }, + { + "epoch": 6.611821086261981, + "grad_norm": 0.06827707588672638, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8278 + }, + { + "epoch": 6.612619808306709, + "grad_norm": 0.05261022970080376, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8279 + }, + { + "epoch": 6.613418530351438, + "grad_norm": 0.11326271295547485, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8280 + }, + { + "epoch": 6.614217252396166, + "grad_norm": 0.1652819961309433, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8281 + }, + { + "epoch": 6.615015974440895, + "grad_norm": 0.10749676078557968, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8282 + }, + { + "epoch": 6.615814696485623, + "grad_norm": 0.20359984040260315, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8283 + }, + { + "epoch": 6.616613418530352, + "grad_norm": 0.18771138787269592, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8284 + }, + { + "epoch": 6.61741214057508, + "grad_norm": 2.5382773876190186, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8285 + }, + { + "epoch": 6.618210862619808, + "grad_norm": 0.30566683411598206, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8286 + }, + { + "epoch": 6.6190095846645365, + "grad_norm": 0.3638366758823395, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8287 + }, + { + "epoch": 6.619808306709265, + "grad_norm": 0.10939022153615952, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8288 + }, + { + "epoch": 6.6206070287539935, + "grad_norm": 0.3243744969367981, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8289 + }, + { + "epoch": 6.621405750798722, + "grad_norm": 0.2703976333141327, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8290 + }, + { + "epoch": 6.622204472843451, + "grad_norm": 0.06998306512832642, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8291 + }, + { + "epoch": 6.623003194888179, + "grad_norm": 0.25409170985221863, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8292 + }, + { + "epoch": 6.623801916932908, + "grad_norm": 0.110246442258358, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8293 + }, + { + "epoch": 6.624600638977636, + "grad_norm": 0.1667647659778595, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8294 + }, + { + "epoch": 6.625399361022364, + "grad_norm": 0.17452718317508698, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8295 + }, + { + "epoch": 6.626198083067092, + "grad_norm": 0.11691702157258987, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8296 + }, + { + "epoch": 6.626996805111821, + "grad_norm": 0.14679500460624695, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8297 + }, + { + "epoch": 6.627795527156549, + "grad_norm": 0.06978808343410492, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8298 + }, + { + "epoch": 6.628594249201278, + "grad_norm": 0.36758533120155334, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8299 + }, + { + "epoch": 6.6293929712460065, + "grad_norm": 0.11101481318473816, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8300 + }, + { + "epoch": 6.630191693290735, + "grad_norm": 0.11762239784002304, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8301 + }, + { + "epoch": 6.6309904153354635, + "grad_norm": 0.11467000097036362, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8302 + }, + { + "epoch": 6.631789137380192, + "grad_norm": 0.14236292243003845, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8303 + }, + { + "epoch": 6.63258785942492, + "grad_norm": 0.050860557705163956, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8304 + }, + { + "epoch": 6.633386581469648, + "grad_norm": 0.07763084024190903, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 8305 + }, + { + "epoch": 6.634185303514377, + "grad_norm": 0.06728993356227875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8306 + }, + { + "epoch": 6.634984025559105, + "grad_norm": 0.06984454393386841, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8307 + }, + { + "epoch": 6.635782747603834, + "grad_norm": 0.09839699417352676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8308 + }, + { + "epoch": 6.636581469648562, + "grad_norm": 0.1262810379266739, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8309 + }, + { + "epoch": 6.637380191693291, + "grad_norm": 0.08147390931844711, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8310 + }, + { + "epoch": 6.638178913738019, + "grad_norm": 0.11567803472280502, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8311 + }, + { + "epoch": 6.638977635782748, + "grad_norm": 0.14972445368766785, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8312 + }, + { + "epoch": 6.6397763578274756, + "grad_norm": 0.2970331609249115, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8313 + }, + { + "epoch": 6.640575079872205, + "grad_norm": 0.05576174706220627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8314 + }, + { + "epoch": 6.641373801916933, + "grad_norm": 0.048716023564338684, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8315 + }, + { + "epoch": 6.642172523961661, + "grad_norm": 0.05986058712005615, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8316 + }, + { + "epoch": 6.64297124600639, + "grad_norm": 0.07985493540763855, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8317 + }, + { + "epoch": 6.643769968051118, + "grad_norm": 0.5361261963844299, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 8318 + }, + { + "epoch": 6.644568690095847, + "grad_norm": 0.15383858978748322, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8319 + }, + { + "epoch": 6.645367412140575, + "grad_norm": 0.17428068816661835, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8320 + }, + { + "epoch": 6.646166134185304, + "grad_norm": 0.09801791608333588, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8321 + }, + { + "epoch": 6.646964856230032, + "grad_norm": 0.11805883049964905, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8322 + }, + { + "epoch": 6.647763578274761, + "grad_norm": 0.13135986030101776, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8323 + }, + { + "epoch": 6.6485623003194885, + "grad_norm": 0.10351908206939697, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8324 + }, + { + "epoch": 6.649361022364217, + "grad_norm": 0.11086217314004898, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8325 + }, + { + "epoch": 6.6501597444089455, + "grad_norm": 0.1173853799700737, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8326 + }, + { + "epoch": 6.650958466453674, + "grad_norm": 0.10743618756532669, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8327 + }, + { + "epoch": 6.651757188498403, + "grad_norm": 0.5378667116165161, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8328 + }, + { + "epoch": 6.652555910543131, + "grad_norm": 0.5077546834945679, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8329 + }, + { + "epoch": 6.65335463258786, + "grad_norm": 0.21998530626296997, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8330 + }, + { + "epoch": 6.654153354632588, + "grad_norm": 0.1235295757651329, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8331 + }, + { + "epoch": 6.654952076677317, + "grad_norm": 0.7328196167945862, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 8332 + }, + { + "epoch": 6.655750798722044, + "grad_norm": 0.12249958515167236, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8333 + }, + { + "epoch": 6.656549520766773, + "grad_norm": 0.12837325036525726, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8334 + }, + { + "epoch": 6.657348242811501, + "grad_norm": 0.09456688165664673, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8335 + }, + { + "epoch": 6.65814696485623, + "grad_norm": 0.13044698536396027, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8336 + }, + { + "epoch": 6.6589456869009584, + "grad_norm": 0.13105876743793488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8337 + }, + { + "epoch": 6.659744408945687, + "grad_norm": 0.14498500525951385, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8338 + }, + { + "epoch": 6.6605431309904155, + "grad_norm": 0.08840721845626831, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8339 + }, + { + "epoch": 6.661341853035144, + "grad_norm": 1.276719570159912, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8340 + }, + { + "epoch": 6.662140575079873, + "grad_norm": 0.36189836263656616, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 8341 + }, + { + "epoch": 6.6629392971246, + "grad_norm": 0.6304068565368652, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 8342 + }, + { + "epoch": 6.663738019169329, + "grad_norm": 0.524870753288269, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 8343 + }, + { + "epoch": 6.664536741214057, + "grad_norm": 0.14638005197048187, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8344 + }, + { + "epoch": 6.665335463258786, + "grad_norm": 0.3090416491031647, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 8345 + }, + { + "epoch": 6.666134185303514, + "grad_norm": 0.1549086570739746, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8346 + }, + { + "epoch": 6.666932907348243, + "grad_norm": 0.36996960639953613, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8347 + }, + { + "epoch": 6.667731629392971, + "grad_norm": 0.4879205524921417, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 8348 + }, + { + "epoch": 6.6685303514377, + "grad_norm": 0.6129382848739624, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8349 + }, + { + "epoch": 6.669329073482428, + "grad_norm": 0.37913191318511963, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8350 + }, + { + "epoch": 6.670127795527156, + "grad_norm": 0.1678311973810196, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 8351 + }, + { + "epoch": 6.6709265175718855, + "grad_norm": 0.17131182551383972, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8352 + }, + { + "epoch": 6.671725239616613, + "grad_norm": 0.29875028133392334, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8353 + }, + { + "epoch": 6.672523961661342, + "grad_norm": 0.5288842916488647, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8354 + }, + { + "epoch": 6.67332268370607, + "grad_norm": 0.24637238681316376, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8355 + }, + { + "epoch": 6.674121405750799, + "grad_norm": 0.25089535117149353, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8356 + }, + { + "epoch": 6.674920127795527, + "grad_norm": 0.5517246723175049, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8357 + }, + { + "epoch": 6.675718849840256, + "grad_norm": 0.07291965931653976, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8358 + }, + { + "epoch": 6.676517571884984, + "grad_norm": 0.2561021149158478, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8359 + }, + { + "epoch": 6.677316293929713, + "grad_norm": 0.2184453308582306, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8360 + }, + { + "epoch": 6.678115015974441, + "grad_norm": 0.10715393722057343, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8361 + }, + { + "epoch": 6.678913738019169, + "grad_norm": 0.16824330389499664, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8362 + }, + { + "epoch": 6.6797124600638975, + "grad_norm": 0.22539092600345612, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8363 + }, + { + "epoch": 6.680511182108626, + "grad_norm": 0.11956257373094559, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8364 + }, + { + "epoch": 6.681309904153355, + "grad_norm": 0.2023434042930603, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8365 + }, + { + "epoch": 6.682108626198083, + "grad_norm": 0.26878416538238525, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8366 + }, + { + "epoch": 6.682907348242812, + "grad_norm": 0.11318770796060562, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8367 + }, + { + "epoch": 6.68370607028754, + "grad_norm": 0.29282090067863464, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8368 + }, + { + "epoch": 6.684504792332269, + "grad_norm": 0.23825445771217346, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8369 + }, + { + "epoch": 6.685303514376997, + "grad_norm": 0.27186012268066406, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 8370 + }, + { + "epoch": 6.686102236421725, + "grad_norm": 0.28540825843811035, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8371 + }, + { + "epoch": 6.686900958466453, + "grad_norm": 0.14273707568645477, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8372 + }, + { + "epoch": 6.687699680511182, + "grad_norm": 0.3684747815132141, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8373 + }, + { + "epoch": 6.68849840255591, + "grad_norm": 0.23812046647071838, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8374 + }, + { + "epoch": 6.689297124600639, + "grad_norm": 0.15459395945072174, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8375 + }, + { + "epoch": 6.6900958466453675, + "grad_norm": 0.28762584924697876, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8376 + }, + { + "epoch": 6.690894568690096, + "grad_norm": 0.16686615347862244, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8377 + }, + { + "epoch": 6.6916932907348246, + "grad_norm": 0.16456246376037598, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8378 + }, + { + "epoch": 6.692492012779553, + "grad_norm": 0.2991560399532318, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8379 + }, + { + "epoch": 6.693290734824281, + "grad_norm": 0.14811092615127563, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8380 + }, + { + "epoch": 6.694089456869009, + "grad_norm": 0.14380809664726257, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8381 + }, + { + "epoch": 6.694888178913738, + "grad_norm": 0.0801207646727562, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8382 + }, + { + "epoch": 6.695686900958466, + "grad_norm": 0.08404620736837387, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8383 + }, + { + "epoch": 6.696485623003195, + "grad_norm": 0.1137305274605751, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8384 + }, + { + "epoch": 6.697284345047923, + "grad_norm": 0.08207721263170242, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8385 + }, + { + "epoch": 6.698083067092652, + "grad_norm": 0.09234748780727386, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8386 + }, + { + "epoch": 6.69888178913738, + "grad_norm": 0.29589149355888367, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8387 + }, + { + "epoch": 6.699680511182109, + "grad_norm": 0.2142077386379242, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8388 + }, + { + "epoch": 6.700479233226837, + "grad_norm": 0.10343299061059952, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8389 + }, + { + "epoch": 6.701277955271565, + "grad_norm": 0.12988241016864777, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8390 + }, + { + "epoch": 6.702076677316294, + "grad_norm": 0.20497195422649384, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 8391 + }, + { + "epoch": 6.702875399361022, + "grad_norm": 0.10697030276060104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8392 + }, + { + "epoch": 6.703674121405751, + "grad_norm": 0.1844921112060547, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8393 + }, + { + "epoch": 6.704472843450479, + "grad_norm": 0.13283176720142365, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8394 + }, + { + "epoch": 6.705271565495208, + "grad_norm": 0.14544987678527832, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8395 + }, + { + "epoch": 6.706070287539936, + "grad_norm": 0.10253588855266571, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8396 + }, + { + "epoch": 6.706869009584665, + "grad_norm": 0.11183217167854309, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8397 + }, + { + "epoch": 6.707667731629393, + "grad_norm": 0.12705212831497192, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8398 + }, + { + "epoch": 6.708466453674122, + "grad_norm": 0.08835884928703308, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8399 + }, + { + "epoch": 6.7092651757188495, + "grad_norm": 0.22377537190914154, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8400 + }, + { + "epoch": 6.710063897763578, + "grad_norm": 0.7205986976623535, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8401 + }, + { + "epoch": 6.710862619808307, + "grad_norm": 0.07383892685174942, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8402 + }, + { + "epoch": 6.711661341853035, + "grad_norm": 0.11109078675508499, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8403 + }, + { + "epoch": 6.712460063897764, + "grad_norm": 0.10979527235031128, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8404 + }, + { + "epoch": 6.713258785942492, + "grad_norm": 0.062491416931152344, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8405 + }, + { + "epoch": 6.714057507987221, + "grad_norm": 0.11196211725473404, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8406 + }, + { + "epoch": 6.714856230031949, + "grad_norm": 0.07815852016210556, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8407 + }, + { + "epoch": 6.715654952076678, + "grad_norm": 3.9684712886810303, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8408 + }, + { + "epoch": 6.716453674121405, + "grad_norm": 0.11982189118862152, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8409 + }, + { + "epoch": 6.717252396166134, + "grad_norm": 0.22319400310516357, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8410 + }, + { + "epoch": 6.718051118210862, + "grad_norm": 0.0937948003411293, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8411 + }, + { + "epoch": 6.718849840255591, + "grad_norm": 0.09193865954875946, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8412 + }, + { + "epoch": 6.7196485623003195, + "grad_norm": 0.08838166296482086, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8413 + }, + { + "epoch": 6.720447284345048, + "grad_norm": 0.0960271805524826, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8414 + }, + { + "epoch": 6.7212460063897765, + "grad_norm": 0.07488188147544861, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8415 + }, + { + "epoch": 6.722044728434505, + "grad_norm": 0.08563253283500671, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8416 + }, + { + "epoch": 6.722843450479234, + "grad_norm": 0.16766750812530518, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8417 + }, + { + "epoch": 6.723642172523961, + "grad_norm": 0.12811559438705444, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8418 + }, + { + "epoch": 6.72444089456869, + "grad_norm": 0.12410838901996613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8419 + }, + { + "epoch": 6.725239616613418, + "grad_norm": 0.1354755014181137, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8420 + }, + { + "epoch": 6.726038338658147, + "grad_norm": 0.17771920561790466, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8421 + }, + { + "epoch": 6.726837060702875, + "grad_norm": 0.19576571881771088, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8422 + }, + { + "epoch": 6.727635782747604, + "grad_norm": 0.5415527820587158, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 8423 + }, + { + "epoch": 6.728434504792332, + "grad_norm": 0.6647717952728271, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8424 + }, + { + "epoch": 6.729233226837061, + "grad_norm": 0.16329380869865417, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8425 + }, + { + "epoch": 6.7300319488817895, + "grad_norm": 0.4046335518360138, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8426 + }, + { + "epoch": 6.730830670926517, + "grad_norm": 0.1817079335451126, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8427 + }, + { + "epoch": 6.731629392971246, + "grad_norm": 0.3438379466533661, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 8428 + }, + { + "epoch": 6.732428115015974, + "grad_norm": 0.48276495933532715, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8429 + }, + { + "epoch": 6.733226837060703, + "grad_norm": 0.4002913236618042, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8430 + }, + { + "epoch": 6.734025559105431, + "grad_norm": 0.37833303213119507, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8431 + }, + { + "epoch": 6.73482428115016, + "grad_norm": 0.26374873518943787, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8432 + }, + { + "epoch": 6.735623003194888, + "grad_norm": 0.19766554236412048, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8433 + }, + { + "epoch": 6.736421725239617, + "grad_norm": 0.1996731013059616, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8434 + }, + { + "epoch": 6.737220447284345, + "grad_norm": 0.19733403623104095, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8435 + }, + { + "epoch": 6.738019169329074, + "grad_norm": 0.24423246085643768, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8436 + }, + { + "epoch": 6.738817891373802, + "grad_norm": 0.4329655170440674, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8437 + }, + { + "epoch": 6.73961661341853, + "grad_norm": 0.6964716911315918, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8438 + }, + { + "epoch": 6.7404153354632586, + "grad_norm": 0.12961135804653168, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8439 + }, + { + "epoch": 6.741214057507987, + "grad_norm": 0.2783071994781494, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8440 + }, + { + "epoch": 6.742012779552716, + "grad_norm": 0.3446369767189026, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8441 + }, + { + "epoch": 6.742811501597444, + "grad_norm": 0.22592051327228546, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8442 + }, + { + "epoch": 6.743610223642173, + "grad_norm": 0.06710102409124374, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8443 + }, + { + "epoch": 6.744408945686901, + "grad_norm": 0.2268608957529068, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8444 + }, + { + "epoch": 6.74520766773163, + "grad_norm": 0.08200005441904068, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8445 + }, + { + "epoch": 6.746006389776358, + "grad_norm": 0.2357168197631836, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8446 + }, + { + "epoch": 6.746805111821086, + "grad_norm": 0.20047837495803833, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8447 + }, + { + "epoch": 6.747603833865814, + "grad_norm": 0.2309340387582779, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8448 + }, + { + "epoch": 6.748402555910543, + "grad_norm": 0.11635745316743851, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8449 + }, + { + "epoch": 6.7492012779552715, + "grad_norm": 0.4076550602912903, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8450 + }, + { + "epoch": 6.75, + "grad_norm": 0.3500226140022278, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8451 + }, + { + "epoch": 6.7507987220447285, + "grad_norm": 0.2993873357772827, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8452 + }, + { + "epoch": 6.751597444089457, + "grad_norm": 0.1099642813205719, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8453 + }, + { + "epoch": 6.752396166134186, + "grad_norm": 0.17455045878887177, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8454 + }, + { + "epoch": 6.753194888178914, + "grad_norm": 0.12831585109233856, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8455 + }, + { + "epoch": 6.753993610223642, + "grad_norm": 0.1048964336514473, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8456 + }, + { + "epoch": 6.75479233226837, + "grad_norm": 0.16713464260101318, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8457 + }, + { + "epoch": 6.755591054313099, + "grad_norm": 0.07837880402803421, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8458 + }, + { + "epoch": 6.756389776357827, + "grad_norm": 0.17375724017620087, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8459 + }, + { + "epoch": 6.757188498402556, + "grad_norm": 0.9700595140457153, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8460 + }, + { + "epoch": 6.757987220447284, + "grad_norm": 0.23614056408405304, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8461 + }, + { + "epoch": 6.758785942492013, + "grad_norm": 0.2536165416240692, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 8462 + }, + { + "epoch": 6.7595846645367414, + "grad_norm": 0.26688873767852783, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 8463 + }, + { + "epoch": 6.76038338658147, + "grad_norm": 0.3807159662246704, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 8464 + }, + { + "epoch": 6.761182108626198, + "grad_norm": 0.2132156789302826, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8465 + }, + { + "epoch": 6.761980830670926, + "grad_norm": 0.19821512699127197, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 8466 + }, + { + "epoch": 6.762779552715655, + "grad_norm": 0.23694948852062225, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 8467 + }, + { + "epoch": 6.763578274760383, + "grad_norm": 0.1524704396724701, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 8468 + }, + { + "epoch": 6.764376996805112, + "grad_norm": 0.26719930768013, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 8469 + }, + { + "epoch": 6.76517571884984, + "grad_norm": 0.12077363580465317, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8470 + }, + { + "epoch": 6.765974440894569, + "grad_norm": 0.14398355782032013, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8471 + }, + { + "epoch": 6.766773162939297, + "grad_norm": 0.1972649097442627, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8472 + }, + { + "epoch": 6.767571884984026, + "grad_norm": 0.10172676295042038, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8473 + }, + { + "epoch": 6.768370607028754, + "grad_norm": 0.10743385553359985, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8474 + }, + { + "epoch": 6.769169329073483, + "grad_norm": 0.06148320063948631, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8475 + }, + { + "epoch": 6.7699680511182105, + "grad_norm": 0.08771604299545288, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8476 + }, + { + "epoch": 6.770766773162939, + "grad_norm": 0.13444122672080994, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8477 + }, + { + "epoch": 6.771565495207668, + "grad_norm": 0.4677158296108246, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8478 + }, + { + "epoch": 6.772364217252396, + "grad_norm": 0.08972432464361191, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8479 + }, + { + "epoch": 6.773162939297125, + "grad_norm": 0.10502214729785919, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8480 + }, + { + "epoch": 6.773961661341853, + "grad_norm": 0.14014923572540283, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8481 + }, + { + "epoch": 6.774760383386582, + "grad_norm": 0.3244888484477997, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8482 + }, + { + "epoch": 6.77555910543131, + "grad_norm": 0.20495742559432983, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8483 + }, + { + "epoch": 6.776357827476039, + "grad_norm": 0.15609663724899292, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8484 + }, + { + "epoch": 6.777156549520766, + "grad_norm": 0.13948239386081696, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8485 + }, + { + "epoch": 6.777955271565495, + "grad_norm": 0.28558677434921265, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8486 + }, + { + "epoch": 6.7787539936102235, + "grad_norm": 0.1481117457151413, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8487 + }, + { + "epoch": 6.779552715654952, + "grad_norm": 0.31998512148857117, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8488 + }, + { + "epoch": 6.7803514376996805, + "grad_norm": 0.1945921927690506, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8489 + }, + { + "epoch": 6.781150159744409, + "grad_norm": 18.217361450195312, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8490 + }, + { + "epoch": 6.781948881789138, + "grad_norm": 0.23472756147384644, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 8491 + }, + { + "epoch": 6.782747603833866, + "grad_norm": 0.10026291757822037, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8492 + }, + { + "epoch": 6.783546325878595, + "grad_norm": 0.14418581128120422, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8493 + }, + { + "epoch": 6.784345047923322, + "grad_norm": 0.14439892768859863, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8494 + }, + { + "epoch": 6.785143769968051, + "grad_norm": 0.37140071392059326, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8495 + }, + { + "epoch": 6.785942492012779, + "grad_norm": 0.09995266050100327, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8496 + }, + { + "epoch": 6.786741214057508, + "grad_norm": 0.08430355042219162, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8497 + }, + { + "epoch": 6.787539936102236, + "grad_norm": 0.11121980845928192, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8498 + }, + { + "epoch": 6.788338658146965, + "grad_norm": 0.20520392060279846, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8499 + }, + { + "epoch": 6.789137380191693, + "grad_norm": 0.10163573920726776, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8500 + }, + { + "epoch": 6.789936102236422, + "grad_norm": 0.12025435268878937, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8501 + }, + { + "epoch": 6.7907348242811505, + "grad_norm": 0.12003593891859055, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8502 + }, + { + "epoch": 6.791533546325878, + "grad_norm": 0.11013154685497284, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8503 + }, + { + "epoch": 6.792332268370607, + "grad_norm": 0.10089465230703354, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8504 + }, + { + "epoch": 6.793130990415335, + "grad_norm": 0.06270314007997513, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8505 + }, + { + "epoch": 6.793929712460064, + "grad_norm": 0.08571597188711166, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8506 + }, + { + "epoch": 6.794728434504792, + "grad_norm": 0.5324975848197937, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8507 + }, + { + "epoch": 6.795527156549521, + "grad_norm": 0.24500170350074768, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8508 + }, + { + "epoch": 6.796325878594249, + "grad_norm": 0.10234003514051437, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8509 + }, + { + "epoch": 6.797124600638978, + "grad_norm": 0.09924131631851196, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8510 + }, + { + "epoch": 6.797923322683706, + "grad_norm": 0.1413181573152542, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8511 + }, + { + "epoch": 6.798722044728435, + "grad_norm": 0.12095441669225693, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8512 + }, + { + "epoch": 6.799520766773163, + "grad_norm": 0.08617071062326431, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8513 + }, + { + "epoch": 6.800319488817891, + "grad_norm": 0.17984576523303986, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 8514 + }, + { + "epoch": 6.80111821086262, + "grad_norm": 0.16447608172893524, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8515 + }, + { + "epoch": 6.801916932907348, + "grad_norm": 0.15486668050289154, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8516 + }, + { + "epoch": 6.802715654952077, + "grad_norm": 0.10176295787096024, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8517 + }, + { + "epoch": 6.803514376996805, + "grad_norm": 0.14911721646785736, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8518 + }, + { + "epoch": 6.804313099041534, + "grad_norm": 0.11073625087738037, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8519 + }, + { + "epoch": 6.805111821086262, + "grad_norm": 0.10299605876207352, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8520 + }, + { + "epoch": 6.805910543130991, + "grad_norm": 0.189669668674469, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8521 + }, + { + "epoch": 6.806709265175719, + "grad_norm": 0.12226799875497818, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8522 + }, + { + "epoch": 6.807507987220447, + "grad_norm": 0.17778469622135162, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8523 + }, + { + "epoch": 6.8083067092651754, + "grad_norm": 0.16370487213134766, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8524 + }, + { + "epoch": 6.809105431309904, + "grad_norm": 0.05171172693371773, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8525 + }, + { + "epoch": 6.8099041533546325, + "grad_norm": 0.16393537819385529, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8526 + }, + { + "epoch": 6.810702875399361, + "grad_norm": 0.09398743510246277, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8527 + }, + { + "epoch": 6.81150159744409, + "grad_norm": 0.08430743217468262, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8528 + }, + { + "epoch": 6.812300319488818, + "grad_norm": 0.1131691113114357, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 8529 + }, + { + "epoch": 6.813099041533547, + "grad_norm": 0.0907130092382431, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8530 + }, + { + "epoch": 6.813897763578275, + "grad_norm": 0.1460096687078476, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8531 + }, + { + "epoch": 6.814696485623003, + "grad_norm": 0.07953288406133652, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8532 + }, + { + "epoch": 6.815495207667731, + "grad_norm": 0.061827294528484344, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 8533 + }, + { + "epoch": 6.81629392971246, + "grad_norm": 0.09172365814447403, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8534 + }, + { + "epoch": 6.817092651757188, + "grad_norm": 0.05858466029167175, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8535 + }, + { + "epoch": 6.817891373801917, + "grad_norm": 0.13774308562278748, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8536 + }, + { + "epoch": 6.818690095846645, + "grad_norm": 0.09840130060911179, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8537 + }, + { + "epoch": 6.819488817891374, + "grad_norm": 0.06836584210395813, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8538 + }, + { + "epoch": 6.8202875399361025, + "grad_norm": 0.15930971503257751, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8539 + }, + { + "epoch": 6.821086261980831, + "grad_norm": 0.12306738644838333, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8540 + }, + { + "epoch": 6.821884984025559, + "grad_norm": 0.09868071228265762, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8541 + }, + { + "epoch": 6.822683706070287, + "grad_norm": 0.09411876648664474, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8542 + }, + { + "epoch": 6.823482428115016, + "grad_norm": 0.09062112122774124, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8543 + }, + { + "epoch": 6.824281150159744, + "grad_norm": 0.14964330196380615, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8544 + }, + { + "epoch": 6.825079872204473, + "grad_norm": 0.1444161832332611, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8545 + }, + { + "epoch": 6.825878594249201, + "grad_norm": 0.15247556567192078, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8546 + }, + { + "epoch": 6.82667731629393, + "grad_norm": 0.1556181013584137, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8547 + }, + { + "epoch": 6.827476038338658, + "grad_norm": 0.1781637817621231, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8548 + }, + { + "epoch": 6.828274760383387, + "grad_norm": 0.10066398978233337, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8549 + }, + { + "epoch": 6.8290734824281145, + "grad_norm": 3.0298452377319336, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8550 + }, + { + "epoch": 6.829872204472844, + "grad_norm": 0.2745296061038971, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8551 + }, + { + "epoch": 6.830670926517572, + "grad_norm": 0.4030947983264923, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 8552 + }, + { + "epoch": 6.8314696485623, + "grad_norm": 0.11019638180732727, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8553 + }, + { + "epoch": 6.832268370607029, + "grad_norm": 0.33687886595726013, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8554 + }, + { + "epoch": 6.833067092651757, + "grad_norm": 0.164499431848526, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8555 + }, + { + "epoch": 6.833865814696486, + "grad_norm": 0.31624776124954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8556 + }, + { + "epoch": 6.834664536741214, + "grad_norm": 0.24264110624790192, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8557 + }, + { + "epoch": 6.835463258785943, + "grad_norm": 0.19310493767261505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8558 + }, + { + "epoch": 6.836261980830671, + "grad_norm": 0.2903575003147125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8559 + }, + { + "epoch": 6.8370607028754, + "grad_norm": 0.22584185004234314, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8560 + }, + { + "epoch": 6.837859424920127, + "grad_norm": 0.2400067150592804, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8561 + }, + { + "epoch": 6.838658146964856, + "grad_norm": 0.22543750703334808, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8562 + }, + { + "epoch": 6.8394568690095845, + "grad_norm": 0.2071310430765152, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8563 + }, + { + "epoch": 6.840255591054313, + "grad_norm": 0.07198980450630188, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8564 + }, + { + "epoch": 6.8410543130990416, + "grad_norm": 0.14733794331550598, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8565 + }, + { + "epoch": 6.84185303514377, + "grad_norm": 0.10259919613599777, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8566 + }, + { + "epoch": 6.842651757188499, + "grad_norm": 0.11961761116981506, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8567 + }, + { + "epoch": 6.843450479233227, + "grad_norm": 0.2714863121509552, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8568 + }, + { + "epoch": 6.844249201277956, + "grad_norm": 0.23675218224525452, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8569 + }, + { + "epoch": 6.845047923322683, + "grad_norm": 0.17738480865955353, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8570 + }, + { + "epoch": 6.845846645367412, + "grad_norm": 0.2558303475379944, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8571 + }, + { + "epoch": 6.84664536741214, + "grad_norm": 0.19869430363178253, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8572 + }, + { + "epoch": 6.847444089456869, + "grad_norm": 0.15806829929351807, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8573 + }, + { + "epoch": 6.848242811501597, + "grad_norm": 0.12016306072473526, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8574 + }, + { + "epoch": 6.849041533546326, + "grad_norm": 0.10831576585769653, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8575 + }, + { + "epoch": 6.8498402555910545, + "grad_norm": 0.06762730330228806, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8576 + }, + { + "epoch": 6.850638977635783, + "grad_norm": 0.0824534222483635, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8577 + }, + { + "epoch": 6.8514376996805115, + "grad_norm": 0.20734307169914246, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8578 + }, + { + "epoch": 6.852236421725239, + "grad_norm": 0.22174668312072754, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8579 + }, + { + "epoch": 6.853035143769968, + "grad_norm": 0.05667027458548546, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8580 + }, + { + "epoch": 6.853833865814696, + "grad_norm": 0.2844708561897278, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8581 + }, + { + "epoch": 6.854632587859425, + "grad_norm": 0.21092848479747772, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8582 + }, + { + "epoch": 6.855431309904153, + "grad_norm": 0.08843044936656952, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8583 + }, + { + "epoch": 6.856230031948882, + "grad_norm": 0.08862966299057007, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8584 + }, + { + "epoch": 6.85702875399361, + "grad_norm": 0.13263291120529175, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8585 + }, + { + "epoch": 6.857827476038339, + "grad_norm": 0.1969175636768341, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8586 + }, + { + "epoch": 6.858626198083067, + "grad_norm": 0.1299106925725937, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8587 + }, + { + "epoch": 6.859424920127795, + "grad_norm": 0.058154329657554626, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8588 + }, + { + "epoch": 6.8602236421725244, + "grad_norm": 0.06485166400671005, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8589 + }, + { + "epoch": 6.861022364217252, + "grad_norm": 6.880006313323975, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8590 + }, + { + "epoch": 6.861821086261981, + "grad_norm": 0.09929946064949036, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8591 + }, + { + "epoch": 6.862619808306709, + "grad_norm": 0.11197477579116821, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8592 + }, + { + "epoch": 6.863418530351438, + "grad_norm": 0.06740657985210419, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8593 + }, + { + "epoch": 6.864217252396166, + "grad_norm": 0.19594676792621613, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8594 + }, + { + "epoch": 6.865015974440895, + "grad_norm": 0.16844215989112854, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8595 + }, + { + "epoch": 6.865814696485623, + "grad_norm": 0.08980540931224823, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8596 + }, + { + "epoch": 6.866613418530352, + "grad_norm": 0.1263660043478012, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8597 + }, + { + "epoch": 6.86741214057508, + "grad_norm": 0.2000604271888733, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8598 + }, + { + "epoch": 6.868210862619808, + "grad_norm": 0.08987699449062347, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8599 + }, + { + "epoch": 6.8690095846645365, + "grad_norm": 0.12263453006744385, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8600 + }, + { + "epoch": 6.869808306709265, + "grad_norm": 0.1567721962928772, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8601 + }, + { + "epoch": 6.8706070287539935, + "grad_norm": 0.08756576478481293, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8602 + }, + { + "epoch": 6.871405750798722, + "grad_norm": 0.11816724389791489, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8603 + }, + { + "epoch": 6.872204472843451, + "grad_norm": 0.13798843324184418, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8604 + }, + { + "epoch": 6.873003194888179, + "grad_norm": 0.12364917248487473, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8605 + }, + { + "epoch": 6.873801916932908, + "grad_norm": 0.1200469508767128, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8606 + }, + { + "epoch": 6.874600638977636, + "grad_norm": 0.12144476920366287, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8607 + }, + { + "epoch": 6.875399361022364, + "grad_norm": 0.20083829760551453, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8608 + }, + { + "epoch": 6.876198083067092, + "grad_norm": 0.2817170023918152, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8609 + }, + { + "epoch": 6.876996805111821, + "grad_norm": 0.12137018889188766, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 8610 + }, + { + "epoch": 6.877795527156549, + "grad_norm": 0.09903489053249359, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8611 + }, + { + "epoch": 6.878594249201278, + "grad_norm": 0.17958515882492065, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8612 + }, + { + "epoch": 6.8793929712460065, + "grad_norm": 0.1041099801659584, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8613 + }, + { + "epoch": 6.880191693290735, + "grad_norm": 0.16099892556667328, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8614 + }, + { + "epoch": 6.8809904153354635, + "grad_norm": 0.061900194734334946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8615 + }, + { + "epoch": 6.881789137380192, + "grad_norm": 0.1341199427843094, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8616 + }, + { + "epoch": 6.88258785942492, + "grad_norm": 0.12683184444904327, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8617 + }, + { + "epoch": 6.883386581469648, + "grad_norm": 0.08566799014806747, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8618 + }, + { + "epoch": 6.884185303514377, + "grad_norm": 0.1616903841495514, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8619 + }, + { + "epoch": 6.884984025559105, + "grad_norm": 0.05832672119140625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8620 + }, + { + "epoch": 6.885782747603834, + "grad_norm": 0.15186071395874023, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8621 + }, + { + "epoch": 6.886581469648562, + "grad_norm": 0.16585935652256012, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8622 + }, + { + "epoch": 6.887380191693291, + "grad_norm": 0.1267954260110855, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8623 + }, + { + "epoch": 6.888178913738019, + "grad_norm": 0.22396692633628845, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8624 + }, + { + "epoch": 6.888977635782748, + "grad_norm": 0.133334219455719, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8625 + }, + { + "epoch": 6.8897763578274756, + "grad_norm": 0.1935819834470749, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8626 + }, + { + "epoch": 6.890575079872205, + "grad_norm": 0.32829585671424866, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8627 + }, + { + "epoch": 6.891373801916933, + "grad_norm": 0.231554314494133, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8628 + }, + { + "epoch": 6.892172523961661, + "grad_norm": 0.20693574845790863, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8629 + }, + { + "epoch": 6.89297124600639, + "grad_norm": 0.21037861704826355, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8630 + }, + { + "epoch": 6.893769968051118, + "grad_norm": 0.051133595407009125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8631 + }, + { + "epoch": 6.894568690095847, + "grad_norm": 0.17635062336921692, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8632 + }, + { + "epoch": 6.895367412140575, + "grad_norm": 0.14592808485031128, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8633 + }, + { + "epoch": 6.896166134185304, + "grad_norm": 0.15353697538375854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8634 + }, + { + "epoch": 6.896964856230032, + "grad_norm": 0.19556251168251038, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8635 + }, + { + "epoch": 6.897763578274761, + "grad_norm": 0.06867649406194687, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8636 + }, + { + "epoch": 6.8985623003194885, + "grad_norm": 0.15286169946193695, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8637 + }, + { + "epoch": 6.899361022364217, + "grad_norm": 0.28361746668815613, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8638 + }, + { + "epoch": 6.9001597444089455, + "grad_norm": 0.09351217746734619, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8639 + }, + { + "epoch": 6.900958466453674, + "grad_norm": 0.11050279438495636, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8640 + }, + { + "epoch": 6.901757188498403, + "grad_norm": 0.1648218333721161, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8641 + }, + { + "epoch": 6.902555910543131, + "grad_norm": 0.10323848575353622, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8642 + }, + { + "epoch": 6.90335463258786, + "grad_norm": 0.14925505220890045, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8643 + }, + { + "epoch": 6.904153354632588, + "grad_norm": 0.05877414718270302, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8644 + }, + { + "epoch": 6.904952076677317, + "grad_norm": 0.3324354290962219, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8645 + }, + { + "epoch": 6.905750798722044, + "grad_norm": 0.22756889462471008, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8646 + }, + { + "epoch": 6.906549520766773, + "grad_norm": 0.1040947288274765, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8647 + }, + { + "epoch": 6.907348242811501, + "grad_norm": 0.1310190111398697, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8648 + }, + { + "epoch": 6.90814696485623, + "grad_norm": 0.09484609216451645, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8649 + }, + { + "epoch": 6.9089456869009584, + "grad_norm": 0.13337384164333344, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8650 + }, + { + "epoch": 6.909744408945687, + "grad_norm": 0.31157273054122925, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8651 + }, + { + "epoch": 6.9105431309904155, + "grad_norm": 0.15081669390201569, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8652 + }, + { + "epoch": 6.911341853035144, + "grad_norm": 0.14120221138000488, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8653 + }, + { + "epoch": 6.912140575079873, + "grad_norm": 0.6128084659576416, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8654 + }, + { + "epoch": 6.9129392971246, + "grad_norm": 0.6915252208709717, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 8655 + }, + { + "epoch": 6.913738019169329, + "grad_norm": 0.7245156168937683, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 8656 + }, + { + "epoch": 6.914536741214057, + "grad_norm": 0.8400923013687134, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 8657 + }, + { + "epoch": 6.915335463258786, + "grad_norm": 0.3218044340610504, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 8658 + }, + { + "epoch": 6.916134185303514, + "grad_norm": 0.3119533061981201, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 8659 + }, + { + "epoch": 6.916932907348243, + "grad_norm": 0.2192138433456421, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 8660 + }, + { + "epoch": 6.917731629392971, + "grad_norm": 0.36212611198425293, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 8661 + }, + { + "epoch": 6.9185303514377, + "grad_norm": 0.13674713671207428, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8662 + }, + { + "epoch": 6.919329073482428, + "grad_norm": 0.24960070848464966, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 8663 + }, + { + "epoch": 6.920127795527156, + "grad_norm": 0.16797062754631042, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8664 + }, + { + "epoch": 6.9209265175718855, + "grad_norm": 0.23811157047748566, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 8665 + }, + { + "epoch": 6.921725239616613, + "grad_norm": 0.25372570753097534, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8666 + }, + { + "epoch": 6.922523961661342, + "grad_norm": 0.13954615592956543, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8667 + }, + { + "epoch": 6.92332268370607, + "grad_norm": 0.17769959568977356, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8668 + }, + { + "epoch": 6.924121405750799, + "grad_norm": 0.14327546954154968, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 8669 + }, + { + "epoch": 6.924920127795527, + "grad_norm": 0.07454083859920502, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 8670 + }, + { + "epoch": 6.925718849840256, + "grad_norm": 0.18561266362667084, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8671 + }, + { + "epoch": 6.926517571884984, + "grad_norm": 0.11927005648612976, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 8672 + }, + { + "epoch": 6.927316293929713, + "grad_norm": 0.06790865212678909, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8673 + }, + { + "epoch": 6.928115015974441, + "grad_norm": 0.22627630829811096, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8674 + }, + { + "epoch": 6.928913738019169, + "grad_norm": 0.21341092884540558, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8675 + }, + { + "epoch": 6.9297124600638975, + "grad_norm": 0.19292457401752472, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 8676 + }, + { + "epoch": 6.930511182108626, + "grad_norm": 0.15046356618404388, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8677 + }, + { + "epoch": 6.931309904153355, + "grad_norm": 0.13845203816890717, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8678 + }, + { + "epoch": 6.932108626198083, + "grad_norm": 0.18034739792346954, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8679 + }, + { + "epoch": 6.932907348242812, + "grad_norm": 0.3970269560813904, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8680 + }, + { + "epoch": 6.93370607028754, + "grad_norm": 0.133075550198555, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8681 + }, + { + "epoch": 6.934504792332269, + "grad_norm": 0.13149690628051758, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 8682 + }, + { + "epoch": 6.935303514376997, + "grad_norm": 0.1332010179758072, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8683 + }, + { + "epoch": 6.936102236421725, + "grad_norm": 0.13125883042812347, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 8684 + }, + { + "epoch": 6.936900958466453, + "grad_norm": 0.5500382781028748, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 8685 + }, + { + "epoch": 6.937699680511182, + "grad_norm": 0.09766851365566254, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8686 + }, + { + "epoch": 6.93849840255591, + "grad_norm": 0.10732626169919968, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8687 + }, + { + "epoch": 6.939297124600639, + "grad_norm": 0.10059154033660889, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8688 + }, + { + "epoch": 6.9400958466453675, + "grad_norm": 0.09518695622682571, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8689 + }, + { + "epoch": 6.940894568690096, + "grad_norm": 0.1279720813035965, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8690 + }, + { + "epoch": 6.9416932907348246, + "grad_norm": 0.0997946485877037, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8691 + }, + { + "epoch": 6.942492012779553, + "grad_norm": 0.08584152907133102, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8692 + }, + { + "epoch": 6.943290734824281, + "grad_norm": 0.06987651437520981, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8693 + }, + { + "epoch": 6.944089456869009, + "grad_norm": 0.10446512699127197, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8694 + }, + { + "epoch": 6.944888178913738, + "grad_norm": 0.08535288274288177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8695 + }, + { + "epoch": 6.945686900958466, + "grad_norm": 0.15912187099456787, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8696 + }, + { + "epoch": 6.946485623003195, + "grad_norm": 0.20139484107494354, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8697 + }, + { + "epoch": 6.947284345047923, + "grad_norm": 0.10153409093618393, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8698 + }, + { + "epoch": 6.948083067092652, + "grad_norm": 0.04925902560353279, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8699 + }, + { + "epoch": 6.94888178913738, + "grad_norm": 0.13896742463111877, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8700 + }, + { + "epoch": 6.949680511182109, + "grad_norm": 0.07297761738300323, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8701 + }, + { + "epoch": 6.950479233226837, + "grad_norm": 0.09260845929384232, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8702 + }, + { + "epoch": 6.951277955271565, + "grad_norm": 0.11840535700321198, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8703 + }, + { + "epoch": 6.952076677316294, + "grad_norm": 0.17365501821041107, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8704 + }, + { + "epoch": 6.952875399361022, + "grad_norm": 0.1369183212518692, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8705 + }, + { + "epoch": 6.953674121405751, + "grad_norm": 0.11277196556329727, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8706 + }, + { + "epoch": 6.954472843450479, + "grad_norm": 0.11032512784004211, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8707 + }, + { + "epoch": 6.955271565495208, + "grad_norm": 0.12437347322702408, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8708 + }, + { + "epoch": 6.956070287539936, + "grad_norm": 0.08772306144237518, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8709 + }, + { + "epoch": 6.956869009584665, + "grad_norm": 0.05245213583111763, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8710 + }, + { + "epoch": 6.957667731629393, + "grad_norm": 0.1591174304485321, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8711 + }, + { + "epoch": 6.958466453674122, + "grad_norm": 0.21121510863304138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8712 + }, + { + "epoch": 6.9592651757188495, + "grad_norm": 0.11379709839820862, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8713 + }, + { + "epoch": 6.960063897763578, + "grad_norm": 0.10083793848752975, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8714 + }, + { + "epoch": 6.960862619808307, + "grad_norm": 0.0790674164891243, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8715 + }, + { + "epoch": 6.961661341853035, + "grad_norm": 0.13917089998722076, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8716 + }, + { + "epoch": 6.962460063897764, + "grad_norm": 0.18794408440589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8717 + }, + { + "epoch": 6.963258785942492, + "grad_norm": 0.10725098103284836, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8718 + }, + { + "epoch": 6.964057507987221, + "grad_norm": 0.14577186107635498, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8719 + }, + { + "epoch": 6.964856230031949, + "grad_norm": 0.06711703538894653, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8720 + }, + { + "epoch": 6.965654952076678, + "grad_norm": 0.20572635531425476, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8721 + }, + { + "epoch": 6.966453674121405, + "grad_norm": 0.13693936169147491, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8722 + }, + { + "epoch": 6.967252396166134, + "grad_norm": 0.05642275512218475, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8723 + }, + { + "epoch": 6.968051118210862, + "grad_norm": 0.09080768376588821, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8724 + }, + { + "epoch": 6.968849840255591, + "grad_norm": 0.05295126140117645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8725 + }, + { + "epoch": 6.9696485623003195, + "grad_norm": 0.11833932250738144, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8726 + }, + { + "epoch": 6.970447284345048, + "grad_norm": 0.12110085785388947, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8727 + }, + { + "epoch": 6.9712460063897765, + "grad_norm": 0.10044527053833008, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8728 + }, + { + "epoch": 6.972044728434505, + "grad_norm": 0.13638640940189362, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8729 + }, + { + "epoch": 6.972843450479234, + "grad_norm": 0.18118594586849213, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8730 + }, + { + "epoch": 6.973642172523961, + "grad_norm": 0.1394396871328354, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8731 + }, + { + "epoch": 6.97444089456869, + "grad_norm": 0.14276480674743652, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8732 + }, + { + "epoch": 6.975239616613418, + "grad_norm": 0.2213817834854126, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8733 + }, + { + "epoch": 6.976038338658147, + "grad_norm": 0.11497826874256134, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8734 + }, + { + "epoch": 6.976837060702875, + "grad_norm": 0.11436138302087784, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8735 + }, + { + "epoch": 6.977635782747604, + "grad_norm": 0.08433762192726135, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 8736 + }, + { + "epoch": 6.978434504792332, + "grad_norm": 0.1584242880344391, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8737 + }, + { + "epoch": 6.979233226837061, + "grad_norm": 0.09111067652702332, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8738 + }, + { + "epoch": 6.9800319488817895, + "grad_norm": 0.09075064212083817, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8739 + }, + { + "epoch": 6.980830670926517, + "grad_norm": 0.08456333726644516, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8740 + }, + { + "epoch": 6.981629392971246, + "grad_norm": 0.08090690523386002, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8741 + }, + { + "epoch": 6.982428115015974, + "grad_norm": 0.42019179463386536, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8742 + }, + { + "epoch": 6.983226837060703, + "grad_norm": 0.119536854326725, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8743 + }, + { + "epoch": 6.984025559105431, + "grad_norm": 0.08138761669397354, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8744 + }, + { + "epoch": 6.98482428115016, + "grad_norm": 0.5337278246879578, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8745 + }, + { + "epoch": 6.985623003194888, + "grad_norm": 0.1773308366537094, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8746 + }, + { + "epoch": 6.986421725239617, + "grad_norm": 0.10939478129148483, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8747 + }, + { + "epoch": 6.987220447284345, + "grad_norm": 0.18635793030261993, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8748 + }, + { + "epoch": 6.988019169329074, + "grad_norm": 0.11675454676151276, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8749 + }, + { + "epoch": 6.988817891373802, + "grad_norm": 0.11787068843841553, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8750 + }, + { + "epoch": 6.98961661341853, + "grad_norm": 0.2457057386636734, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8751 + }, + { + "epoch": 6.9904153354632586, + "grad_norm": 0.05914906784892082, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 8752 + }, + { + "epoch": 6.991214057507987, + "grad_norm": 0.1494094878435135, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8753 + }, + { + "epoch": 6.992012779552716, + "grad_norm": 0.14485910534858704, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8754 + }, + { + "epoch": 6.992811501597444, + "grad_norm": 1.2348047494888306, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8755 + }, + { + "epoch": 6.993610223642173, + "grad_norm": 0.1546175330877304, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8756 + }, + { + "epoch": 6.994408945686901, + "grad_norm": 0.13474640250205994, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8757 + }, + { + "epoch": 6.99520766773163, + "grad_norm": 0.5535407662391663, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8758 + }, + { + "epoch": 6.996006389776358, + "grad_norm": 0.10516832023859024, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8759 + }, + { + "epoch": 6.996805111821086, + "grad_norm": 0.07872752100229263, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8760 + }, + { + "epoch": 6.997603833865814, + "grad_norm": 0.08130715042352676, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8761 + }, + { + "epoch": 6.998402555910543, + "grad_norm": 0.09496142715215683, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8762 + }, + { + "epoch": 6.9992012779552715, + "grad_norm": 0.06645053625106812, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8763 + }, + { + "epoch": 7.0, + "grad_norm": 0.07332758605480194, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8764 + }, + { + "epoch": 7.0007987220447285, + "grad_norm": 0.09108536690473557, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8765 + }, + { + "epoch": 7.001597444089457, + "grad_norm": 0.13202883303165436, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8766 + }, + { + "epoch": 7.002396166134186, + "grad_norm": 0.09079252928495407, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 8767 + }, + { + "epoch": 7.003194888178914, + "grad_norm": 0.1004822626709938, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8768 + }, + { + "epoch": 7.003993610223642, + "grad_norm": 0.05096781253814697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8769 + }, + { + "epoch": 7.00479233226837, + "grad_norm": 0.14213396608829498, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8770 + }, + { + "epoch": 7.005591054313099, + "grad_norm": 0.11614344269037247, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8771 + }, + { + "epoch": 7.006389776357827, + "grad_norm": 0.1144147664308548, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8772 + }, + { + "epoch": 7.007188498402556, + "grad_norm": 0.1504330188035965, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8773 + }, + { + "epoch": 7.007987220447284, + "grad_norm": 0.10443079471588135, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8774 + }, + { + "epoch": 7.008785942492013, + "grad_norm": 0.166890949010849, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8775 + }, + { + "epoch": 7.0095846645367414, + "grad_norm": 0.12496565282344818, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8776 + }, + { + "epoch": 7.01038338658147, + "grad_norm": 0.12851381301879883, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8777 + }, + { + "epoch": 7.0111821086261985, + "grad_norm": 0.20198717713356018, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8778 + }, + { + "epoch": 7.011980830670926, + "grad_norm": 0.10324864089488983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8779 + }, + { + "epoch": 7.012779552715655, + "grad_norm": 0.12864094972610474, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8780 + }, + { + "epoch": 7.013578274760383, + "grad_norm": 0.11301549524068832, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8781 + }, + { + "epoch": 7.014376996805112, + "grad_norm": 0.13162367045879364, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8782 + }, + { + "epoch": 7.01517571884984, + "grad_norm": 0.1574760377407074, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8783 + }, + { + "epoch": 7.015974440894569, + "grad_norm": 0.07471634447574615, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8784 + }, + { + "epoch": 7.016773162939297, + "grad_norm": 0.09653516113758087, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8785 + }, + { + "epoch": 7.017571884984026, + "grad_norm": 0.13719993829727173, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8786 + }, + { + "epoch": 7.018370607028754, + "grad_norm": 0.10545443743467331, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8787 + }, + { + "epoch": 7.019169329073482, + "grad_norm": 0.1147511675953865, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8788 + }, + { + "epoch": 7.0199680511182105, + "grad_norm": 0.14005234837532043, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8789 + }, + { + "epoch": 7.020766773162939, + "grad_norm": 0.36956554651260376, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8790 + }, + { + "epoch": 7.021565495207668, + "grad_norm": 0.1384177953004837, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8791 + }, + { + "epoch": 7.022364217252396, + "grad_norm": 0.062106356024742126, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8792 + }, + { + "epoch": 7.023162939297125, + "grad_norm": 0.14074385166168213, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8793 + }, + { + "epoch": 7.023961661341853, + "grad_norm": 0.18152809143066406, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8794 + }, + { + "epoch": 7.024760383386582, + "grad_norm": 0.11607832461595535, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8795 + }, + { + "epoch": 7.02555910543131, + "grad_norm": 0.06603241711854935, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8796 + }, + { + "epoch": 7.026357827476039, + "grad_norm": 0.08846289664506912, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8797 + }, + { + "epoch": 7.027156549520766, + "grad_norm": 0.09882134944200516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8798 + }, + { + "epoch": 7.027955271565495, + "grad_norm": 0.11535032093524933, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8799 + }, + { + "epoch": 7.0287539936102235, + "grad_norm": 0.10153281688690186, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8800 + }, + { + "epoch": 7.029552715654952, + "grad_norm": 0.11195418983697891, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8801 + }, + { + "epoch": 7.0303514376996805, + "grad_norm": 0.5721603035926819, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8802 + }, + { + "epoch": 7.031150159744409, + "grad_norm": 0.18006286025047302, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8803 + }, + { + "epoch": 7.031948881789138, + "grad_norm": 0.16561086475849152, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8804 + }, + { + "epoch": 7.032747603833866, + "grad_norm": 0.11010444164276123, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8805 + }, + { + "epoch": 7.033546325878595, + "grad_norm": 0.17741475999355316, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8806 + }, + { + "epoch": 7.034345047923322, + "grad_norm": 0.09941161423921585, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 8807 + }, + { + "epoch": 7.035143769968051, + "grad_norm": 0.20474617183208466, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8808 + }, + { + "epoch": 7.035942492012779, + "grad_norm": 0.07972154021263123, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8809 + }, + { + "epoch": 7.036741214057508, + "grad_norm": 0.17856109142303467, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8810 + }, + { + "epoch": 7.037539936102236, + "grad_norm": 0.1276514083147049, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8811 + }, + { + "epoch": 7.038338658146965, + "grad_norm": 0.08009849488735199, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 8812 + }, + { + "epoch": 7.039137380191693, + "grad_norm": 0.09832913428544998, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8813 + }, + { + "epoch": 7.039936102236422, + "grad_norm": 0.06454402953386307, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8814 + }, + { + "epoch": 7.0407348242811505, + "grad_norm": 0.20843401551246643, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8815 + }, + { + "epoch": 7.041533546325879, + "grad_norm": 0.14909301698207855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8816 + }, + { + "epoch": 7.042332268370607, + "grad_norm": 0.08815812319517136, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8817 + }, + { + "epoch": 7.043130990415335, + "grad_norm": 0.18957766890525818, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8818 + }, + { + "epoch": 7.043929712460064, + "grad_norm": 0.33018213510513306, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8819 + }, + { + "epoch": 7.044728434504792, + "grad_norm": 0.11069374531507492, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8820 + }, + { + "epoch": 7.045527156549521, + "grad_norm": 0.3001084625720978, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8821 + }, + { + "epoch": 7.046325878594249, + "grad_norm": 0.0704922303557396, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8822 + }, + { + "epoch": 7.047124600638978, + "grad_norm": 0.08537211269140244, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8823 + }, + { + "epoch": 7.047923322683706, + "grad_norm": 0.08765899389982224, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8824 + }, + { + "epoch": 7.048722044728435, + "grad_norm": 0.14218255877494812, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8825 + }, + { + "epoch": 7.0495207667731625, + "grad_norm": 0.08026671409606934, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8826 + }, + { + "epoch": 7.050319488817891, + "grad_norm": 0.07170549035072327, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8827 + }, + { + "epoch": 7.05111821086262, + "grad_norm": 1.2578401565551758, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8828 + }, + { + "epoch": 7.051916932907348, + "grad_norm": 0.20149891078472137, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8829 + }, + { + "epoch": 7.052715654952077, + "grad_norm": 0.18734677135944366, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8830 + }, + { + "epoch": 7.053514376996805, + "grad_norm": 0.08732877671718597, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8831 + }, + { + "epoch": 7.054313099041534, + "grad_norm": 0.1895754486322403, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8832 + }, + { + "epoch": 7.055111821086262, + "grad_norm": 0.06839644908905029, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8833 + }, + { + "epoch": 7.055910543130991, + "grad_norm": 4.666222095489502, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8834 + }, + { + "epoch": 7.056709265175719, + "grad_norm": 0.2801821231842041, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8835 + }, + { + "epoch": 7.057507987220447, + "grad_norm": 0.3428499102592468, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8836 + }, + { + "epoch": 7.0583067092651754, + "grad_norm": 0.16896478831768036, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8837 + }, + { + "epoch": 7.059105431309904, + "grad_norm": 1.21062171459198, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8838 + }, + { + "epoch": 7.0599041533546325, + "grad_norm": 0.20507270097732544, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8839 + }, + { + "epoch": 7.060702875399361, + "grad_norm": 0.34736308455467224, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8840 + }, + { + "epoch": 7.06150159744409, + "grad_norm": 0.13628798723220825, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8841 + }, + { + "epoch": 7.062300319488818, + "grad_norm": 0.3212411403656006, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8842 + }, + { + "epoch": 7.063099041533547, + "grad_norm": 0.23049144446849823, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8843 + }, + { + "epoch": 7.063897763578275, + "grad_norm": 0.2785285413265228, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8844 + }, + { + "epoch": 7.064696485623003, + "grad_norm": 0.32158368825912476, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8845 + }, + { + "epoch": 7.065495207667731, + "grad_norm": 0.40443500876426697, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8846 + }, + { + "epoch": 7.06629392971246, + "grad_norm": 0.20072752237319946, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8847 + }, + { + "epoch": 7.067092651757188, + "grad_norm": 0.38166266679763794, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8848 + }, + { + "epoch": 7.067891373801917, + "grad_norm": 0.2771472930908203, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8849 + }, + { + "epoch": 7.068690095846645, + "grad_norm": 0.10485964268445969, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8850 + }, + { + "epoch": 7.069488817891374, + "grad_norm": 0.17424215376377106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8851 + }, + { + "epoch": 7.0702875399361025, + "grad_norm": 0.0972314327955246, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8852 + }, + { + "epoch": 7.071086261980831, + "grad_norm": 0.18021832406520844, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8853 + }, + { + "epoch": 7.0718849840255595, + "grad_norm": 0.08820143342018127, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8854 + }, + { + "epoch": 7.072683706070287, + "grad_norm": 0.1785898506641388, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8855 + }, + { + "epoch": 7.073482428115016, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8856 + }, + { + "epoch": 7.074281150159744, + "grad_norm": 0.1787438541650772, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8857 + }, + { + "epoch": 7.075079872204473, + "grad_norm": 0.16761353611946106, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8858 + }, + { + "epoch": 7.075878594249201, + "grad_norm": 0.5075165629386902, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8859 + }, + { + "epoch": 7.07667731629393, + "grad_norm": 0.13462364673614502, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8860 + }, + { + "epoch": 7.077476038338658, + "grad_norm": 0.20478707551956177, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8861 + }, + { + "epoch": 7.078274760383387, + "grad_norm": 0.14689947664737701, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8862 + }, + { + "epoch": 7.079073482428115, + "grad_norm": 0.36265847086906433, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8863 + }, + { + "epoch": 7.079872204472843, + "grad_norm": 0.18443043529987335, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8864 + }, + { + "epoch": 7.080670926517572, + "grad_norm": 0.04789111018180847, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8865 + }, + { + "epoch": 7.0814696485623, + "grad_norm": 0.18024222552776337, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8866 + }, + { + "epoch": 7.082268370607029, + "grad_norm": 0.08901690691709518, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8867 + }, + { + "epoch": 7.083067092651757, + "grad_norm": 0.20689153671264648, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8868 + }, + { + "epoch": 7.083865814696486, + "grad_norm": 0.15572768449783325, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8869 + }, + { + "epoch": 7.084664536741214, + "grad_norm": 0.2915050685405731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8870 + }, + { + "epoch": 7.085463258785943, + "grad_norm": 0.12404290586709976, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8871 + }, + { + "epoch": 7.086261980830671, + "grad_norm": 0.19628335535526276, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8872 + }, + { + "epoch": 7.0870607028754, + "grad_norm": 0.6693617105484009, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8873 + }, + { + "epoch": 7.087859424920127, + "grad_norm": 0.21526481211185455, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8874 + }, + { + "epoch": 7.088658146964856, + "grad_norm": 0.2779954969882965, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8875 + }, + { + "epoch": 7.0894568690095845, + "grad_norm": 0.14111320674419403, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8876 + }, + { + "epoch": 7.090255591054313, + "grad_norm": 0.26465079188346863, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8877 + }, + { + "epoch": 7.0910543130990416, + "grad_norm": 0.12354349344968796, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8878 + }, + { + "epoch": 7.09185303514377, + "grad_norm": 0.18360896408557892, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8879 + }, + { + "epoch": 7.092651757188499, + "grad_norm": 0.26844218373298645, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8880 + }, + { + "epoch": 7.093450479233227, + "grad_norm": 0.34032055735588074, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8881 + }, + { + "epoch": 7.094249201277956, + "grad_norm": 0.2372630089521408, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8882 + }, + { + "epoch": 7.095047923322683, + "grad_norm": 0.4134571850299835, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8883 + }, + { + "epoch": 7.095846645367412, + "grad_norm": 0.21220949292182922, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8884 + }, + { + "epoch": 7.09664536741214, + "grad_norm": 0.20073527097702026, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8885 + }, + { + "epoch": 7.097444089456869, + "grad_norm": 0.1583309918642044, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8886 + }, + { + "epoch": 7.098242811501597, + "grad_norm": 0.4032151401042938, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8887 + }, + { + "epoch": 7.099041533546326, + "grad_norm": 0.09527560323476791, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 8888 + }, + { + "epoch": 7.0998402555910545, + "grad_norm": 0.2630043625831604, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8889 + }, + { + "epoch": 7.100638977635783, + "grad_norm": 0.06699138134717941, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8890 + }, + { + "epoch": 7.1014376996805115, + "grad_norm": 0.34307003021240234, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8891 + }, + { + "epoch": 7.102236421725239, + "grad_norm": 0.24538451433181763, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8892 + }, + { + "epoch": 7.103035143769968, + "grad_norm": 0.2794513702392578, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8893 + }, + { + "epoch": 7.103833865814696, + "grad_norm": 0.20586012303829193, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8894 + }, + { + "epoch": 7.104632587859425, + "grad_norm": 0.22349807620048523, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8895 + }, + { + "epoch": 7.105431309904153, + "grad_norm": 0.31171584129333496, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8896 + }, + { + "epoch": 7.106230031948882, + "grad_norm": 0.07461030781269073, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8897 + }, + { + "epoch": 7.10702875399361, + "grad_norm": 0.24280597269535065, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8898 + }, + { + "epoch": 7.107827476038339, + "grad_norm": 0.13005708158016205, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8899 + }, + { + "epoch": 7.108626198083067, + "grad_norm": 0.24730080366134644, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8900 + }, + { + "epoch": 7.109424920127796, + "grad_norm": 1.287341833114624, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8901 + }, + { + "epoch": 7.110223642172524, + "grad_norm": 0.15945735573768616, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8902 + }, + { + "epoch": 7.111022364217252, + "grad_norm": 0.09943541884422302, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8903 + }, + { + "epoch": 7.111821086261981, + "grad_norm": 0.12183468043804169, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8904 + }, + { + "epoch": 7.112619808306709, + "grad_norm": 0.11859191954135895, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8905 + }, + { + "epoch": 7.113418530351438, + "grad_norm": 0.27701425552368164, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8906 + }, + { + "epoch": 7.114217252396166, + "grad_norm": 0.14724725484848022, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8907 + }, + { + "epoch": 7.115015974440895, + "grad_norm": 0.1342400461435318, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8908 + }, + { + "epoch": 7.115814696485623, + "grad_norm": 0.15474970638751984, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8909 + }, + { + "epoch": 7.116613418530352, + "grad_norm": 0.1276721954345703, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8910 + }, + { + "epoch": 7.11741214057508, + "grad_norm": 0.14511124789714813, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8911 + }, + { + "epoch": 7.118210862619808, + "grad_norm": 0.10112027823925018, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8912 + }, + { + "epoch": 7.1190095846645365, + "grad_norm": 0.17296795547008514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8913 + }, + { + "epoch": 7.119808306709265, + "grad_norm": 0.09542828798294067, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8914 + }, + { + "epoch": 7.1206070287539935, + "grad_norm": 0.17453183233737946, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8915 + }, + { + "epoch": 7.121405750798722, + "grad_norm": 0.13417603075504303, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8916 + }, + { + "epoch": 7.122204472843451, + "grad_norm": 0.26239508390426636, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8917 + }, + { + "epoch": 7.123003194888179, + "grad_norm": 0.13963834941387177, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8918 + }, + { + "epoch": 7.123801916932908, + "grad_norm": 0.18642054498195648, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8919 + }, + { + "epoch": 7.124600638977636, + "grad_norm": 0.17754590511322021, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8920 + }, + { + "epoch": 7.125399361022364, + "grad_norm": 0.1010628268122673, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8921 + }, + { + "epoch": 7.126198083067092, + "grad_norm": 0.1621905416250229, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8922 + }, + { + "epoch": 7.126996805111821, + "grad_norm": 0.3069966733455658, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8923 + }, + { + "epoch": 7.127795527156549, + "grad_norm": 0.2312333881855011, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8924 + }, + { + "epoch": 7.128594249201278, + "grad_norm": 0.20297785103321075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8925 + }, + { + "epoch": 7.1293929712460065, + "grad_norm": 0.18856601417064667, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8926 + }, + { + "epoch": 7.130191693290735, + "grad_norm": 0.19353985786437988, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8927 + }, + { + "epoch": 7.1309904153354635, + "grad_norm": 0.08276687562465668, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8928 + }, + { + "epoch": 7.131789137380192, + "grad_norm": 0.31372779607772827, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8929 + }, + { + "epoch": 7.13258785942492, + "grad_norm": 0.10208959877490997, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8930 + }, + { + "epoch": 7.133386581469648, + "grad_norm": 0.1636659801006317, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8931 + }, + { + "epoch": 7.134185303514377, + "grad_norm": 0.14321425557136536, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8932 + }, + { + "epoch": 7.134984025559105, + "grad_norm": 0.08438511192798615, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8933 + }, + { + "epoch": 7.135782747603834, + "grad_norm": 0.17451012134552002, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8934 + }, + { + "epoch": 7.136581469648562, + "grad_norm": 0.06913795322179794, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8935 + }, + { + "epoch": 7.137380191693291, + "grad_norm": 0.14176666736602783, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8936 + }, + { + "epoch": 7.138178913738019, + "grad_norm": 0.15005643665790558, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8937 + }, + { + "epoch": 7.138977635782748, + "grad_norm": 0.08884457498788834, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8938 + }, + { + "epoch": 7.139776357827476, + "grad_norm": 0.19651612639427185, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8939 + }, + { + "epoch": 7.140575079872204, + "grad_norm": 0.12419132143259048, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8940 + }, + { + "epoch": 7.141373801916933, + "grad_norm": 0.08800125867128372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8941 + }, + { + "epoch": 7.142172523961661, + "grad_norm": 0.12308578193187714, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8942 + }, + { + "epoch": 7.14297124600639, + "grad_norm": 0.06376682221889496, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8943 + }, + { + "epoch": 7.143769968051118, + "grad_norm": 0.08467467129230499, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8944 + }, + { + "epoch": 7.144568690095847, + "grad_norm": 0.05492696538567543, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8945 + }, + { + "epoch": 7.145367412140575, + "grad_norm": 0.12659363448619843, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8946 + }, + { + "epoch": 7.146166134185304, + "grad_norm": 0.11025204509496689, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8947 + }, + { + "epoch": 7.146964856230032, + "grad_norm": 0.03672007843852043, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8948 + }, + { + "epoch": 7.147763578274761, + "grad_norm": 0.06386546790599823, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8949 + }, + { + "epoch": 7.1485623003194885, + "grad_norm": 0.05484751984477043, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8950 + }, + { + "epoch": 7.149361022364217, + "grad_norm": 0.08663280308246613, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8951 + }, + { + "epoch": 7.1501597444089455, + "grad_norm": 0.10515031963586807, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8952 + }, + { + "epoch": 7.150958466453674, + "grad_norm": 0.05844622105360031, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8953 + }, + { + "epoch": 7.151757188498403, + "grad_norm": 0.061575960367918015, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8954 + }, + { + "epoch": 7.152555910543131, + "grad_norm": 0.30169913172721863, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8955 + }, + { + "epoch": 7.15335463258786, + "grad_norm": 0.15433792769908905, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8956 + }, + { + "epoch": 7.154153354632588, + "grad_norm": 0.11872339993715286, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8957 + }, + { + "epoch": 7.154952076677317, + "grad_norm": 0.4086587131023407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8958 + }, + { + "epoch": 7.155750798722044, + "grad_norm": 0.0976172536611557, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8959 + }, + { + "epoch": 7.156549520766773, + "grad_norm": 0.11132699996232986, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8960 + }, + { + "epoch": 7.157348242811501, + "grad_norm": 0.11129645258188248, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8961 + }, + { + "epoch": 7.15814696485623, + "grad_norm": 0.09004200249910355, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8962 + }, + { + "epoch": 7.1589456869009584, + "grad_norm": 0.1225908026099205, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8963 + }, + { + "epoch": 7.159744408945687, + "grad_norm": 0.10531286895275116, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8964 + }, + { + "epoch": 7.1605431309904155, + "grad_norm": 0.1054515391588211, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8965 + }, + { + "epoch": 7.161341853035144, + "grad_norm": 0.11718834936618805, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8966 + }, + { + "epoch": 7.162140575079873, + "grad_norm": 0.11314168572425842, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8967 + }, + { + "epoch": 7.1629392971246, + "grad_norm": 0.1017487570643425, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8968 + }, + { + "epoch": 7.163738019169329, + "grad_norm": 0.05381032079458237, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8969 + }, + { + "epoch": 7.164536741214057, + "grad_norm": 0.1527879238128662, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8970 + }, + { + "epoch": 7.165335463258786, + "grad_norm": 0.05352415144443512, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8971 + }, + { + "epoch": 7.166134185303514, + "grad_norm": 0.17179784178733826, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8972 + }, + { + "epoch": 7.166932907348243, + "grad_norm": 0.24629469215869904, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8973 + }, + { + "epoch": 7.167731629392971, + "grad_norm": 0.11276146024465561, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8974 + }, + { + "epoch": 7.1685303514377, + "grad_norm": 0.0927032083272934, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8975 + }, + { + "epoch": 7.169329073482428, + "grad_norm": 0.0978626236319542, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8976 + }, + { + "epoch": 7.170127795527157, + "grad_norm": 0.12577946484088898, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8977 + }, + { + "epoch": 7.170926517571885, + "grad_norm": 0.1014678105711937, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8978 + }, + { + "epoch": 7.171725239616613, + "grad_norm": 0.08706190437078476, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8979 + }, + { + "epoch": 7.172523961661342, + "grad_norm": 0.06214338168501854, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8980 + }, + { + "epoch": 7.17332268370607, + "grad_norm": 0.08223161101341248, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8981 + }, + { + "epoch": 7.174121405750799, + "grad_norm": 0.3143157362937927, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8982 + }, + { + "epoch": 7.174920127795527, + "grad_norm": 0.16466212272644043, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8983 + }, + { + "epoch": 7.175718849840256, + "grad_norm": 0.13650043308734894, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8984 + }, + { + "epoch": 7.176517571884984, + "grad_norm": 0.05605694651603699, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8985 + }, + { + "epoch": 7.177316293929713, + "grad_norm": 0.12153269350528717, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8986 + }, + { + "epoch": 7.178115015974441, + "grad_norm": 0.07390844076871872, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8987 + }, + { + "epoch": 7.178913738019169, + "grad_norm": 0.05618416517972946, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8988 + }, + { + "epoch": 7.1797124600638975, + "grad_norm": 0.24178527295589447, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8989 + }, + { + "epoch": 7.180511182108626, + "grad_norm": 0.06414328515529633, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8990 + }, + { + "epoch": 7.181309904153355, + "grad_norm": 0.05483662337064743, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8991 + }, + { + "epoch": 7.182108626198083, + "grad_norm": 0.05821032077074051, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8992 + }, + { + "epoch": 7.182907348242812, + "grad_norm": 0.04972073435783386, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8993 + }, + { + "epoch": 7.18370607028754, + "grad_norm": 0.13323748111724854, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8994 + }, + { + "epoch": 7.184504792332269, + "grad_norm": 0.1341763287782669, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8995 + }, + { + "epoch": 7.185303514376997, + "grad_norm": 0.1092606782913208, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8996 + }, + { + "epoch": 7.186102236421725, + "grad_norm": 0.10611139982938766, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8997 + }, + { + "epoch": 7.186900958466453, + "grad_norm": 0.0810476616024971, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8998 + }, + { + "epoch": 7.187699680511182, + "grad_norm": 0.053938958793878555, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8999 + }, + { + "epoch": 7.18849840255591, + "grad_norm": 0.08355431258678436, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9000 + }, + { + "epoch": 7.189297124600639, + "grad_norm": 0.0719372034072876, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9001 + }, + { + "epoch": 7.1900958466453675, + "grad_norm": 0.0541183203458786, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9002 + }, + { + "epoch": 7.190894568690096, + "grad_norm": 0.08637872338294983, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9003 + }, + { + "epoch": 7.1916932907348246, + "grad_norm": 0.0900801345705986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9004 + }, + { + "epoch": 7.192492012779553, + "grad_norm": 0.08778835088014603, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9005 + }, + { + "epoch": 7.193290734824281, + "grad_norm": 0.13946911692619324, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9006 + }, + { + "epoch": 7.194089456869009, + "grad_norm": 0.20089952647686005, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9007 + }, + { + "epoch": 7.194888178913738, + "grad_norm": 0.20472672581672668, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 9008 + }, + { + "epoch": 7.195686900958466, + "grad_norm": 0.09503829479217529, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9009 + }, + { + "epoch": 7.196485623003195, + "grad_norm": 0.057289477437734604, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9010 + }, + { + "epoch": 7.197284345047923, + "grad_norm": 0.18998531997203827, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9011 + }, + { + "epoch": 7.198083067092652, + "grad_norm": 0.12228010594844818, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9012 + }, + { + "epoch": 7.19888178913738, + "grad_norm": 0.0855637639760971, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9013 + }, + { + "epoch": 7.199680511182109, + "grad_norm": 0.08341407775878906, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9014 + }, + { + "epoch": 7.2004792332268375, + "grad_norm": 0.06806697696447372, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9015 + }, + { + "epoch": 7.201277955271565, + "grad_norm": 0.06730692833662033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9016 + }, + { + "epoch": 7.202076677316294, + "grad_norm": 0.04983438923954964, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9017 + }, + { + "epoch": 7.202875399361022, + "grad_norm": 0.09153386205434799, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9018 + }, + { + "epoch": 7.203674121405751, + "grad_norm": 0.06117153540253639, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9019 + }, + { + "epoch": 7.204472843450479, + "grad_norm": 0.056790344417095184, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9020 + }, + { + "epoch": 7.205271565495208, + "grad_norm": 0.8241305351257324, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9021 + }, + { + "epoch": 7.206070287539936, + "grad_norm": 0.21823863685131073, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9022 + }, + { + "epoch": 7.206869009584665, + "grad_norm": 0.14799124002456665, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9023 + }, + { + "epoch": 7.207667731629393, + "grad_norm": 0.09815513342618942, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9024 + }, + { + "epoch": 7.208466453674121, + "grad_norm": 0.2076011300086975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9025 + }, + { + "epoch": 7.2092651757188495, + "grad_norm": 0.13652865588665009, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9026 + }, + { + "epoch": 7.210063897763578, + "grad_norm": 0.15180739760398865, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9027 + }, + { + "epoch": 7.210862619808307, + "grad_norm": 0.11385779827833176, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9028 + }, + { + "epoch": 7.211661341853035, + "grad_norm": 0.05047432705760002, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9029 + }, + { + "epoch": 7.212460063897764, + "grad_norm": 0.13789398968219757, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9030 + }, + { + "epoch": 7.213258785942492, + "grad_norm": 0.10509981215000153, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9031 + }, + { + "epoch": 7.214057507987221, + "grad_norm": 0.19650724530220032, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9032 + }, + { + "epoch": 7.214856230031949, + "grad_norm": 0.11788946390151978, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9033 + }, + { + "epoch": 7.215654952076678, + "grad_norm": 0.11023712903261185, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9034 + }, + { + "epoch": 7.216453674121405, + "grad_norm": 0.3382134735584259, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9035 + }, + { + "epoch": 7.217252396166134, + "grad_norm": 0.20465348660945892, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9036 + }, + { + "epoch": 7.218051118210862, + "grad_norm": 0.17456264793872833, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9037 + }, + { + "epoch": 7.218849840255591, + "grad_norm": 0.09034306555986404, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9038 + }, + { + "epoch": 7.2196485623003195, + "grad_norm": 0.15296493470668793, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9039 + }, + { + "epoch": 7.220447284345048, + "grad_norm": 0.1379650980234146, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9040 + }, + { + "epoch": 7.2212460063897765, + "grad_norm": 0.20932430028915405, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9041 + }, + { + "epoch": 7.222044728434505, + "grad_norm": 0.09309016168117523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9042 + }, + { + "epoch": 7.222843450479234, + "grad_norm": 0.13084891438484192, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9043 + }, + { + "epoch": 7.223642172523961, + "grad_norm": 0.1435803472995758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9044 + }, + { + "epoch": 7.22444089456869, + "grad_norm": 0.05868425592780113, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9045 + }, + { + "epoch": 7.225239616613418, + "grad_norm": 0.09483210742473602, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9046 + }, + { + "epoch": 7.226038338658147, + "grad_norm": 0.20051591098308563, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9047 + }, + { + "epoch": 7.226837060702875, + "grad_norm": 0.09253975749015808, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9048 + }, + { + "epoch": 7.227635782747604, + "grad_norm": 0.15865609049797058, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9049 + }, + { + "epoch": 7.228434504792332, + "grad_norm": 0.14421933889389038, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9050 + }, + { + "epoch": 7.229233226837061, + "grad_norm": 0.13492006063461304, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9051 + }, + { + "epoch": 7.2300319488817895, + "grad_norm": 0.06581155210733414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9052 + }, + { + "epoch": 7.230830670926518, + "grad_norm": 0.12610170245170593, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9053 + }, + { + "epoch": 7.231629392971246, + "grad_norm": 0.12813681364059448, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9054 + }, + { + "epoch": 7.232428115015974, + "grad_norm": 0.07228157669305801, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9055 + }, + { + "epoch": 7.233226837060703, + "grad_norm": 0.13456740975379944, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9056 + }, + { + "epoch": 7.234025559105431, + "grad_norm": 0.10491029918193817, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9057 + }, + { + "epoch": 7.23482428115016, + "grad_norm": 0.14090387523174286, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9058 + }, + { + "epoch": 7.235623003194888, + "grad_norm": 0.10722684115171432, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9059 + }, + { + "epoch": 7.236421725239617, + "grad_norm": 0.05123287811875343, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9060 + }, + { + "epoch": 7.237220447284345, + "grad_norm": 0.1203593909740448, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9061 + }, + { + "epoch": 7.238019169329074, + "grad_norm": 0.07847320288419724, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9062 + }, + { + "epoch": 7.2388178913738015, + "grad_norm": 0.09621457010507584, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9063 + }, + { + "epoch": 7.23961661341853, + "grad_norm": 0.11915068328380585, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9064 + }, + { + "epoch": 7.2404153354632586, + "grad_norm": 0.18357326090335846, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9065 + }, + { + "epoch": 7.241214057507987, + "grad_norm": 0.06862817704677582, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9066 + }, + { + "epoch": 7.242012779552716, + "grad_norm": 0.05091634392738342, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9067 + }, + { + "epoch": 7.242811501597444, + "grad_norm": 0.09132825583219528, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9068 + }, + { + "epoch": 7.243610223642173, + "grad_norm": 0.11998780816793442, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9069 + }, + { + "epoch": 7.244408945686901, + "grad_norm": 0.0678768903017044, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9070 + }, + { + "epoch": 7.24520766773163, + "grad_norm": 0.19880260527133942, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9071 + }, + { + "epoch": 7.246006389776358, + "grad_norm": 0.06379543989896774, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9072 + }, + { + "epoch": 7.246805111821086, + "grad_norm": 0.06652764976024628, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9073 + }, + { + "epoch": 7.247603833865814, + "grad_norm": 0.10495885461568832, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9074 + }, + { + "epoch": 7.248402555910543, + "grad_norm": 0.14753985404968262, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9075 + }, + { + "epoch": 7.2492012779552715, + "grad_norm": 0.08283182233572006, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9076 + }, + { + "epoch": 7.25, + "grad_norm": 0.1378672569990158, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9077 + }, + { + "epoch": 7.2507987220447285, + "grad_norm": 0.10274125635623932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9078 + }, + { + "epoch": 7.251597444089457, + "grad_norm": 0.09236814826726913, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9079 + }, + { + "epoch": 7.252396166134186, + "grad_norm": 0.07923156023025513, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9080 + }, + { + "epoch": 7.253194888178914, + "grad_norm": 0.2953792214393616, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9081 + }, + { + "epoch": 7.253993610223642, + "grad_norm": 9.043856620788574, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9082 + }, + { + "epoch": 7.25479233226837, + "grad_norm": 60.094329833984375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9083 + }, + { + "epoch": 7.255591054313099, + "grad_norm": 48.363075256347656, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 9084 + }, + { + "epoch": 7.256389776357827, + "grad_norm": 92.13807678222656, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9085 + }, + { + "epoch": 7.257188498402556, + "grad_norm": 71.66429138183594, + "learning_rate": 0.0005, + "loss": 1.1524, + "step": 9086 + }, + { + "epoch": 7.257987220447284, + "grad_norm": 29.742534637451172, + "learning_rate": 0.0005, + "loss": 1.2362, + "step": 9087 + }, + { + "epoch": 7.258785942492013, + "grad_norm": 1.1841496229171753, + "learning_rate": 0.0005, + "loss": 1.4452, + "step": 9088 + }, + { + "epoch": 7.2595846645367414, + "grad_norm": 0.7909824252128601, + "learning_rate": 0.0005, + "loss": 1.3049, + "step": 9089 + }, + { + "epoch": 7.26038338658147, + "grad_norm": 0.796114444732666, + "learning_rate": 0.0005, + "loss": 1.2852, + "step": 9090 + }, + { + "epoch": 7.261182108626198, + "grad_norm": 0.9014440178871155, + "learning_rate": 0.0005, + "loss": 1.2243, + "step": 9091 + }, + { + "epoch": 7.261980830670926, + "grad_norm": 0.5654944777488708, + "learning_rate": 0.0005, + "loss": 1.1462, + "step": 9092 + }, + { + "epoch": 7.262779552715655, + "grad_norm": 1.0784763097763062, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 9093 + }, + { + "epoch": 7.263578274760383, + "grad_norm": 0.9014595150947571, + "learning_rate": 0.0005, + "loss": 1.1629, + "step": 9094 + }, + { + "epoch": 7.264376996805112, + "grad_norm": 0.4847378730773926, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9095 + }, + { + "epoch": 7.26517571884984, + "grad_norm": 0.5493710041046143, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9096 + }, + { + "epoch": 7.265974440894569, + "grad_norm": 1.0691193342208862, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 9097 + }, + { + "epoch": 7.266773162939297, + "grad_norm": 2.062331199645996, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 9098 + }, + { + "epoch": 7.267571884984026, + "grad_norm": 2.778977632522583, + "learning_rate": 0.0005, + "loss": 1.2775, + "step": 9099 + }, + { + "epoch": 7.268370607028754, + "grad_norm": 0.8807574510574341, + "learning_rate": 0.0005, + "loss": 1.2851, + "step": 9100 + }, + { + "epoch": 7.269169329073483, + "grad_norm": 1.0370792150497437, + "learning_rate": 0.0005, + "loss": 1.1677, + "step": 9101 + }, + { + "epoch": 7.2699680511182105, + "grad_norm": 0.5272591710090637, + "learning_rate": 0.0005, + "loss": 1.1754, + "step": 9102 + }, + { + "epoch": 7.270766773162939, + "grad_norm": 0.5510113835334778, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 9103 + }, + { + "epoch": 7.271565495207668, + "grad_norm": 0.4650730490684509, + "learning_rate": 0.0005, + "loss": 1.1741, + "step": 9104 + }, + { + "epoch": 7.272364217252396, + "grad_norm": 1.071080207824707, + "learning_rate": 0.0005, + "loss": 1.1418, + "step": 9105 + }, + { + "epoch": 7.273162939297125, + "grad_norm": 0.32088524103164673, + "learning_rate": 0.0005, + "loss": 1.1304, + "step": 9106 + }, + { + "epoch": 7.273961661341853, + "grad_norm": 1.2110369205474854, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 9107 + }, + { + "epoch": 7.274760383386582, + "grad_norm": 0.8781233429908752, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 9108 + }, + { + "epoch": 7.27555910543131, + "grad_norm": 0.356841117143631, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9109 + }, + { + "epoch": 7.276357827476039, + "grad_norm": 0.41136255860328674, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 9110 + }, + { + "epoch": 7.277156549520766, + "grad_norm": 0.30638960003852844, + "learning_rate": 0.0005, + "loss": 1.1006, + "step": 9111 + }, + { + "epoch": 7.277955271565495, + "grad_norm": 0.3056134879589081, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 9112 + }, + { + "epoch": 7.2787539936102235, + "grad_norm": 0.3053964376449585, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 9113 + }, + { + "epoch": 7.279552715654952, + "grad_norm": 0.2799919843673706, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 9114 + }, + { + "epoch": 7.2803514376996805, + "grad_norm": 0.19091907143592834, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 9115 + }, + { + "epoch": 7.281150159744409, + "grad_norm": 0.19973579049110413, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 9116 + }, + { + "epoch": 7.281948881789138, + "grad_norm": 0.21867726743221283, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 9117 + }, + { + "epoch": 7.282747603833866, + "grad_norm": 0.10351689904928207, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 9118 + }, + { + "epoch": 7.283546325878595, + "grad_norm": 0.16956113278865814, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 9119 + }, + { + "epoch": 7.284345047923322, + "grad_norm": 0.2959003150463104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 9120 + }, + { + "epoch": 7.285143769968051, + "grad_norm": 0.18194587528705597, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 9121 + }, + { + "epoch": 7.285942492012779, + "grad_norm": 0.10713140666484833, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 9122 + }, + { + "epoch": 7.286741214057508, + "grad_norm": 0.2391309142112732, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9123 + }, + { + "epoch": 7.287539936102236, + "grad_norm": 0.25640085339546204, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 9124 + }, + { + "epoch": 7.288338658146965, + "grad_norm": 0.25697845220565796, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9125 + }, + { + "epoch": 7.289137380191693, + "grad_norm": 0.2679392695426941, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 9126 + }, + { + "epoch": 7.289936102236422, + "grad_norm": 0.3405737280845642, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9127 + }, + { + "epoch": 7.2907348242811505, + "grad_norm": 0.31081417202949524, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 9128 + }, + { + "epoch": 7.291533546325878, + "grad_norm": 0.15159590542316437, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 9129 + }, + { + "epoch": 7.292332268370607, + "grad_norm": 1.1609382629394531, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 9130 + }, + { + "epoch": 7.293130990415335, + "grad_norm": 0.5588571429252625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 9131 + }, + { + "epoch": 7.293929712460064, + "grad_norm": 0.47076234221458435, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9132 + }, + { + "epoch": 7.294728434504792, + "grad_norm": 1.184756875038147, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 9133 + }, + { + "epoch": 7.295527156549521, + "grad_norm": 0.40956422686576843, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9134 + }, + { + "epoch": 7.296325878594249, + "grad_norm": 0.8017024397850037, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 9135 + }, + { + "epoch": 7.297124600638978, + "grad_norm": 0.29993146657943726, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9136 + }, + { + "epoch": 7.297923322683706, + "grad_norm": 0.4549245238304138, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 9137 + }, + { + "epoch": 7.298722044728435, + "grad_norm": 0.26366063952445984, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 9138 + }, + { + "epoch": 7.2995207667731625, + "grad_norm": 0.3126361668109894, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 9139 + }, + { + "epoch": 7.300319488817891, + "grad_norm": 0.18184784054756165, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 9140 + }, + { + "epoch": 7.30111821086262, + "grad_norm": 0.91683429479599, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 9141 + }, + { + "epoch": 7.301916932907348, + "grad_norm": 3.3384642601013184, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9142 + }, + { + "epoch": 7.302715654952077, + "grad_norm": 0.21734145283699036, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 9143 + }, + { + "epoch": 7.303514376996805, + "grad_norm": 0.13850291073322296, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9144 + }, + { + "epoch": 7.304313099041534, + "grad_norm": 0.1737629920244217, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 9145 + }, + { + "epoch": 7.305111821086262, + "grad_norm": 0.3947316110134125, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 9146 + }, + { + "epoch": 7.305910543130991, + "grad_norm": 0.16360799968242645, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9147 + }, + { + "epoch": 7.306709265175719, + "grad_norm": 0.14816711843013763, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9148 + }, + { + "epoch": 7.307507987220447, + "grad_norm": 0.13554179668426514, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9149 + }, + { + "epoch": 7.3083067092651754, + "grad_norm": 0.10308978706598282, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9150 + }, + { + "epoch": 7.309105431309904, + "grad_norm": 0.11216582357883453, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 9151 + }, + { + "epoch": 7.3099041533546325, + "grad_norm": 0.08531700819730759, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9152 + }, + { + "epoch": 7.310702875399361, + "grad_norm": 0.10261841118335724, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9153 + }, + { + "epoch": 7.31150159744409, + "grad_norm": 0.18318074941635132, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9154 + }, + { + "epoch": 7.312300319488818, + "grad_norm": 0.1616939902305603, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9155 + }, + { + "epoch": 7.313099041533547, + "grad_norm": 0.10412739217281342, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9156 + }, + { + "epoch": 7.313897763578275, + "grad_norm": 0.14097075164318085, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9157 + }, + { + "epoch": 7.314696485623003, + "grad_norm": 0.2168329358100891, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 9158 + }, + { + "epoch": 7.315495207667731, + "grad_norm": 0.14337286353111267, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 9159 + }, + { + "epoch": 7.31629392971246, + "grad_norm": 0.10328586399555206, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9160 + }, + { + "epoch": 7.317092651757188, + "grad_norm": 0.15820610523223877, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9161 + }, + { + "epoch": 7.317891373801917, + "grad_norm": 0.11771009862422943, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9162 + }, + { + "epoch": 7.318690095846645, + "grad_norm": 0.06801208108663559, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9163 + }, + { + "epoch": 7.319488817891374, + "grad_norm": 0.08691044896841049, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9164 + }, + { + "epoch": 7.3202875399361025, + "grad_norm": 0.10149878263473511, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9165 + }, + { + "epoch": 7.321086261980831, + "grad_norm": 0.08544973284006119, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9166 + }, + { + "epoch": 7.321884984025559, + "grad_norm": 0.21312831342220306, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9167 + }, + { + "epoch": 7.322683706070287, + "grad_norm": 0.09866507351398468, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9168 + }, + { + "epoch": 7.323482428115016, + "grad_norm": 0.09676753729581833, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9169 + }, + { + "epoch": 7.324281150159744, + "grad_norm": 0.1783452033996582, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9170 + }, + { + "epoch": 7.325079872204473, + "grad_norm": 0.16399280726909637, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9171 + }, + { + "epoch": 7.325878594249201, + "grad_norm": 0.1160425990819931, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9172 + }, + { + "epoch": 7.32667731629393, + "grad_norm": 0.09826952964067459, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9173 + }, + { + "epoch": 7.327476038338658, + "grad_norm": 0.1292516440153122, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9174 + }, + { + "epoch": 7.328274760383387, + "grad_norm": 0.1253383606672287, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9175 + }, + { + "epoch": 7.329073482428115, + "grad_norm": 0.15330855548381805, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9176 + }, + { + "epoch": 7.329872204472843, + "grad_norm": 0.16339725255966187, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9177 + }, + { + "epoch": 7.330670926517572, + "grad_norm": 0.1716328263282776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9178 + }, + { + "epoch": 7.3314696485623, + "grad_norm": 0.07669667154550552, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9179 + }, + { + "epoch": 7.332268370607029, + "grad_norm": 0.06626272946596146, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 9180 + }, + { + "epoch": 7.333067092651757, + "grad_norm": 0.0935940146446228, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9181 + }, + { + "epoch": 7.333865814696486, + "grad_norm": 0.07840511202812195, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9182 + }, + { + "epoch": 7.334664536741214, + "grad_norm": 0.07776588946580887, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9183 + }, + { + "epoch": 7.335463258785943, + "grad_norm": 0.084624283015728, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9184 + }, + { + "epoch": 7.336261980830671, + "grad_norm": 0.07562167197465897, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9185 + }, + { + "epoch": 7.3370607028754, + "grad_norm": 0.08628194034099579, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9186 + }, + { + "epoch": 7.337859424920127, + "grad_norm": 0.0654950812458992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9187 + }, + { + "epoch": 7.338658146964856, + "grad_norm": 0.06403883546590805, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9188 + }, + { + "epoch": 7.3394568690095845, + "grad_norm": 0.8679103851318359, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9189 + }, + { + "epoch": 7.340255591054313, + "grad_norm": 0.42257770895957947, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 9190 + }, + { + "epoch": 7.3410543130990416, + "grad_norm": 0.3017493486404419, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 9191 + }, + { + "epoch": 7.34185303514377, + "grad_norm": 0.30509164929389954, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 9192 + }, + { + "epoch": 7.342651757188499, + "grad_norm": 0.28457221388816833, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 9193 + }, + { + "epoch": 7.343450479233227, + "grad_norm": 0.2734214961528778, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9194 + }, + { + "epoch": 7.344249201277956, + "grad_norm": 0.2931375801563263, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 9195 + }, + { + "epoch": 7.345047923322683, + "grad_norm": 0.11534975469112396, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9196 + }, + { + "epoch": 7.345846645367412, + "grad_norm": 0.1489555388689041, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 9197 + }, + { + "epoch": 7.34664536741214, + "grad_norm": 0.13024470210075378, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 9198 + }, + { + "epoch": 7.347444089456869, + "grad_norm": 0.1413331776857376, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9199 + }, + { + "epoch": 7.348242811501597, + "grad_norm": 0.07862340658903122, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9200 + }, + { + "epoch": 7.349041533546326, + "grad_norm": 0.0870542973279953, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9201 + }, + { + "epoch": 7.3498402555910545, + "grad_norm": 0.07556174695491791, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 9202 + }, + { + "epoch": 7.350638977635783, + "grad_norm": 0.07381146401166916, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9203 + }, + { + "epoch": 7.3514376996805115, + "grad_norm": 0.5006929636001587, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 9204 + }, + { + "epoch": 7.352236421725239, + "grad_norm": 0.2980809807777405, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 9205 + }, + { + "epoch": 7.353035143769968, + "grad_norm": 0.20632435381412506, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 9206 + }, + { + "epoch": 7.353833865814696, + "grad_norm": 0.2028435915708542, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9207 + }, + { + "epoch": 7.354632587859425, + "grad_norm": 0.220264732837677, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9208 + }, + { + "epoch": 7.355431309904153, + "grad_norm": 0.07175029814243317, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9209 + }, + { + "epoch": 7.356230031948882, + "grad_norm": 0.20052626729011536, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9210 + }, + { + "epoch": 7.35702875399361, + "grad_norm": 0.3549690544605255, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9211 + }, + { + "epoch": 7.357827476038339, + "grad_norm": 0.1310572475194931, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9212 + }, + { + "epoch": 7.358626198083067, + "grad_norm": 0.9551740288734436, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 9213 + }, + { + "epoch": 7.359424920127796, + "grad_norm": 0.13663409650325775, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9214 + }, + { + "epoch": 7.360223642172524, + "grad_norm": 0.11436715722084045, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9215 + }, + { + "epoch": 7.361022364217252, + "grad_norm": 0.10911283642053604, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 9216 + }, + { + "epoch": 7.361821086261981, + "grad_norm": 0.11186671257019043, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9217 + }, + { + "epoch": 7.362619808306709, + "grad_norm": 0.1308698207139969, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9218 + }, + { + "epoch": 7.363418530351438, + "grad_norm": 0.07584013044834137, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9219 + }, + { + "epoch": 7.364217252396166, + "grad_norm": 0.07789483666419983, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 9220 + }, + { + "epoch": 7.365015974440895, + "grad_norm": 0.12758736312389374, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 9221 + }, + { + "epoch": 7.365814696485623, + "grad_norm": 0.09310994297266006, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9222 + }, + { + "epoch": 7.366613418530352, + "grad_norm": 0.14761847257614136, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9223 + }, + { + "epoch": 7.36741214057508, + "grad_norm": 0.8784921169281006, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9224 + }, + { + "epoch": 7.368210862619808, + "grad_norm": 0.07754036784172058, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9225 + }, + { + "epoch": 7.3690095846645365, + "grad_norm": 0.06706640869379044, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9226 + }, + { + "epoch": 7.369808306709265, + "grad_norm": 0.0949360579252243, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9227 + }, + { + "epoch": 7.3706070287539935, + "grad_norm": 0.09635552763938904, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9228 + }, + { + "epoch": 7.371405750798722, + "grad_norm": 0.15888135135173798, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9229 + }, + { + "epoch": 7.372204472843451, + "grad_norm": 0.1487814337015152, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9230 + }, + { + "epoch": 7.373003194888179, + "grad_norm": 0.09755469113588333, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9231 + }, + { + "epoch": 7.373801916932908, + "grad_norm": 0.2550356984138489, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9232 + }, + { + "epoch": 7.374600638977636, + "grad_norm": 0.13796621561050415, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9233 + }, + { + "epoch": 7.375399361022364, + "grad_norm": 0.06727192550897598, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9234 + }, + { + "epoch": 7.376198083067092, + "grad_norm": 0.09111928194761276, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 9235 + }, + { + "epoch": 7.376996805111821, + "grad_norm": 0.15708492696285248, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9236 + }, + { + "epoch": 7.377795527156549, + "grad_norm": 0.06607159227132797, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9237 + }, + { + "epoch": 7.378594249201278, + "grad_norm": 0.3495469391345978, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9238 + }, + { + "epoch": 7.3793929712460065, + "grad_norm": 0.249598890542984, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9239 + }, + { + "epoch": 7.380191693290735, + "grad_norm": 0.1506706029176712, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9240 + }, + { + "epoch": 7.3809904153354635, + "grad_norm": 0.2053573578596115, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9241 + }, + { + "epoch": 7.381789137380192, + "grad_norm": 0.20234468579292297, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9242 + }, + { + "epoch": 7.38258785942492, + "grad_norm": 0.23514828085899353, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9243 + }, + { + "epoch": 7.383386581469648, + "grad_norm": 0.13418453931808472, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9244 + }, + { + "epoch": 7.384185303514377, + "grad_norm": 0.07703951746225357, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9245 + }, + { + "epoch": 7.384984025559105, + "grad_norm": 0.20256030559539795, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 9246 + }, + { + "epoch": 7.385782747603834, + "grad_norm": 0.1140165850520134, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9247 + }, + { + "epoch": 7.386581469648562, + "grad_norm": 0.6283542513847351, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9248 + }, + { + "epoch": 7.387380191693291, + "grad_norm": 0.11779789626598358, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9249 + }, + { + "epoch": 7.388178913738019, + "grad_norm": 0.09821031987667084, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9250 + }, + { + "epoch": 7.388977635782748, + "grad_norm": 0.10942906141281128, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9251 + }, + { + "epoch": 7.389776357827476, + "grad_norm": 0.6150240302085876, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9252 + }, + { + "epoch": 7.390575079872204, + "grad_norm": 0.17758208513259888, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9253 + }, + { + "epoch": 7.391373801916933, + "grad_norm": 0.09567593038082123, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9254 + }, + { + "epoch": 7.392172523961661, + "grad_norm": 0.1177724078297615, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9255 + }, + { + "epoch": 7.39297124600639, + "grad_norm": 0.12369771301746368, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9256 + }, + { + "epoch": 7.393769968051118, + "grad_norm": 0.11247415840625763, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9257 + }, + { + "epoch": 7.394568690095847, + "grad_norm": 0.15094342827796936, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9258 + }, + { + "epoch": 7.395367412140575, + "grad_norm": 0.113029845058918, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9259 + }, + { + "epoch": 7.396166134185304, + "grad_norm": 0.1620573252439499, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9260 + }, + { + "epoch": 7.396964856230032, + "grad_norm": 0.10010898113250732, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9261 + }, + { + "epoch": 7.397763578274761, + "grad_norm": 0.21061348915100098, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9262 + }, + { + "epoch": 7.3985623003194885, + "grad_norm": 0.06199006363749504, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9263 + }, + { + "epoch": 7.399361022364217, + "grad_norm": 0.09612002968788147, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9264 + }, + { + "epoch": 7.4001597444089455, + "grad_norm": 0.13255780935287476, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9265 + }, + { + "epoch": 7.400958466453674, + "grad_norm": 0.22877056896686554, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9266 + }, + { + "epoch": 7.401757188498403, + "grad_norm": 0.18957512080669403, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9267 + }, + { + "epoch": 7.402555910543131, + "grad_norm": 0.211961030960083, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9268 + }, + { + "epoch": 7.40335463258786, + "grad_norm": 0.07744339853525162, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9269 + }, + { + "epoch": 7.404153354632588, + "grad_norm": 0.19085711240768433, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9270 + }, + { + "epoch": 7.404952076677317, + "grad_norm": 0.13099227845668793, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9271 + }, + { + "epoch": 7.405750798722044, + "grad_norm": 0.24543818831443787, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9272 + }, + { + "epoch": 7.406549520766773, + "grad_norm": 0.18623757362365723, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9273 + }, + { + "epoch": 7.407348242811501, + "grad_norm": 0.06898430734872818, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9274 + }, + { + "epoch": 7.40814696485623, + "grad_norm": 0.1809006780385971, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9275 + }, + { + "epoch": 7.4089456869009584, + "grad_norm": 0.11338596791028976, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9276 + }, + { + "epoch": 7.409744408945687, + "grad_norm": 0.10182031989097595, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9277 + }, + { + "epoch": 7.4105431309904155, + "grad_norm": 0.1521865278482437, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9278 + }, + { + "epoch": 7.411341853035144, + "grad_norm": 0.08848808705806732, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9279 + }, + { + "epoch": 7.412140575079873, + "grad_norm": 0.10398431867361069, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9280 + }, + { + "epoch": 7.4129392971246, + "grad_norm": 0.10145912319421768, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9281 + }, + { + "epoch": 7.413738019169329, + "grad_norm": 0.12386789917945862, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9282 + }, + { + "epoch": 7.414536741214057, + "grad_norm": 0.09763981401920319, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9283 + }, + { + "epoch": 7.415335463258786, + "grad_norm": 0.08810468763113022, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9284 + }, + { + "epoch": 7.416134185303514, + "grad_norm": 0.06196752190589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9285 + }, + { + "epoch": 7.416932907348243, + "grad_norm": 1.4297560453414917, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9286 + }, + { + "epoch": 7.417731629392971, + "grad_norm": 0.07783587276935577, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9287 + }, + { + "epoch": 7.4185303514377, + "grad_norm": 0.3592485189437866, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9288 + }, + { + "epoch": 7.419329073482428, + "grad_norm": 0.10796934366226196, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9289 + }, + { + "epoch": 7.420127795527157, + "grad_norm": 0.11450864374637604, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9290 + }, + { + "epoch": 7.420926517571885, + "grad_norm": 0.06718776375055313, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9291 + }, + { + "epoch": 7.421725239616613, + "grad_norm": 0.1776629537343979, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 9292 + }, + { + "epoch": 7.422523961661342, + "grad_norm": 0.058177318423986435, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9293 + }, + { + "epoch": 7.42332268370607, + "grad_norm": 0.08145572990179062, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9294 + }, + { + "epoch": 7.424121405750799, + "grad_norm": 0.07605774700641632, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9295 + }, + { + "epoch": 7.424920127795527, + "grad_norm": 0.5453565120697021, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9296 + }, + { + "epoch": 7.425718849840256, + "grad_norm": 0.08215200155973434, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9297 + }, + { + "epoch": 7.426517571884984, + "grad_norm": 0.06014016270637512, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9298 + }, + { + "epoch": 7.427316293929713, + "grad_norm": 0.11043576151132584, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9299 + }, + { + "epoch": 7.428115015974441, + "grad_norm": 0.1421220898628235, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9300 + }, + { + "epoch": 7.428913738019169, + "grad_norm": 0.10473544150590897, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9301 + }, + { + "epoch": 7.4297124600638975, + "grad_norm": 0.09921323508024216, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9302 + }, + { + "epoch": 7.430511182108626, + "grad_norm": 0.07775744050741196, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9303 + }, + { + "epoch": 7.431309904153355, + "grad_norm": 0.3015517294406891, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9304 + }, + { + "epoch": 7.432108626198083, + "grad_norm": 0.06826018542051315, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9305 + }, + { + "epoch": 7.432907348242812, + "grad_norm": 0.06002574786543846, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9306 + }, + { + "epoch": 7.43370607028754, + "grad_norm": 0.07082310318946838, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9307 + }, + { + "epoch": 7.434504792332269, + "grad_norm": 0.1356203258037567, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9308 + }, + { + "epoch": 7.435303514376997, + "grad_norm": 0.09689080715179443, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9309 + }, + { + "epoch": 7.436102236421725, + "grad_norm": 0.0938429981470108, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9310 + }, + { + "epoch": 7.436900958466453, + "grad_norm": 0.0853746086359024, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9311 + }, + { + "epoch": 7.437699680511182, + "grad_norm": 0.09427982568740845, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9312 + }, + { + "epoch": 7.43849840255591, + "grad_norm": 0.14042942225933075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9313 + }, + { + "epoch": 7.439297124600639, + "grad_norm": 0.4248291552066803, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9314 + }, + { + "epoch": 7.4400958466453675, + "grad_norm": 0.18214350938796997, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9315 + }, + { + "epoch": 7.440894568690096, + "grad_norm": 0.2564402222633362, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 9316 + }, + { + "epoch": 7.4416932907348246, + "grad_norm": 0.10012423992156982, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9317 + }, + { + "epoch": 7.442492012779553, + "grad_norm": 0.15337461233139038, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 9318 + }, + { + "epoch": 7.443290734824281, + "grad_norm": 0.1396649181842804, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9319 + }, + { + "epoch": 7.444089456869009, + "grad_norm": 0.12310001254081726, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9320 + }, + { + "epoch": 7.444888178913738, + "grad_norm": 0.12932278215885162, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9321 + }, + { + "epoch": 7.445686900958466, + "grad_norm": 0.12403959035873413, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9322 + }, + { + "epoch": 7.446485623003195, + "grad_norm": 0.4164578318595886, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9323 + }, + { + "epoch": 7.447284345047923, + "grad_norm": 0.2015235871076584, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 9324 + }, + { + "epoch": 7.448083067092652, + "grad_norm": 0.2619101107120514, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9325 + }, + { + "epoch": 7.44888178913738, + "grad_norm": 0.07511210441589355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9326 + }, + { + "epoch": 7.449680511182109, + "grad_norm": 7.956277370452881, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9327 + }, + { + "epoch": 7.4504792332268375, + "grad_norm": 0.23822273313999176, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 9328 + }, + { + "epoch": 7.451277955271565, + "grad_norm": 0.1565391719341278, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 9329 + }, + { + "epoch": 7.452076677316294, + "grad_norm": 0.15820777416229248, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 9330 + }, + { + "epoch": 7.452875399361022, + "grad_norm": 0.16341058909893036, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 9331 + }, + { + "epoch": 7.453674121405751, + "grad_norm": 0.19414658844470978, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 9332 + }, + { + "epoch": 7.454472843450479, + "grad_norm": 0.18798880279064178, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9333 + }, + { + "epoch": 7.455271565495208, + "grad_norm": 0.09032963961362839, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 9334 + }, + { + "epoch": 7.456070287539936, + "grad_norm": 0.12746790051460266, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9335 + }, + { + "epoch": 7.456869009584665, + "grad_norm": 0.34985360503196716, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9336 + }, + { + "epoch": 7.457667731629393, + "grad_norm": 0.22745627164840698, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 9337 + }, + { + "epoch": 7.458466453674122, + "grad_norm": 1.297531247138977, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9338 + }, + { + "epoch": 7.4592651757188495, + "grad_norm": 0.3254985809326172, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9339 + }, + { + "epoch": 7.460063897763578, + "grad_norm": 0.28899863362312317, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9340 + }, + { + "epoch": 7.460862619808307, + "grad_norm": 0.09964017570018768, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9341 + }, + { + "epoch": 7.461661341853035, + "grad_norm": 0.2713227868080139, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9342 + }, + { + "epoch": 7.462460063897764, + "grad_norm": 0.16604198515415192, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 9343 + }, + { + "epoch": 7.463258785942492, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 9344 + }, + { + "epoch": 7.464057507987221, + "grad_norm": 0.20081757009029388, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9345 + }, + { + "epoch": 7.464856230031949, + "grad_norm": 0.14005789160728455, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 9346 + }, + { + "epoch": 7.465654952076678, + "grad_norm": 0.15481705963611603, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9347 + }, + { + "epoch": 7.466453674121405, + "grad_norm": 0.1843721717596054, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9348 + }, + { + "epoch": 7.467252396166134, + "grad_norm": 0.11873828619718552, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 9349 + }, + { + "epoch": 7.468051118210862, + "grad_norm": 0.199008509516716, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9350 + }, + { + "epoch": 7.468849840255591, + "grad_norm": 0.10533998161554337, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9351 + }, + { + "epoch": 7.4696485623003195, + "grad_norm": 0.4823262691497803, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 9352 + }, + { + "epoch": 7.470447284345048, + "grad_norm": 0.25044289231300354, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9353 + }, + { + "epoch": 7.4712460063897765, + "grad_norm": 0.11273030936717987, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9354 + }, + { + "epoch": 7.472044728434505, + "grad_norm": 0.15552200376987457, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9355 + }, + { + "epoch": 7.472843450479234, + "grad_norm": 0.2211492508649826, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 9356 + }, + { + "epoch": 7.473642172523961, + "grad_norm": 0.38023853302001953, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9357 + }, + { + "epoch": 7.47444089456869, + "grad_norm": 0.15553027391433716, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9358 + }, + { + "epoch": 7.475239616613418, + "grad_norm": 0.11964324861764908, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9359 + }, + { + "epoch": 7.476038338658147, + "grad_norm": 0.06454652547836304, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9360 + }, + { + "epoch": 7.476837060702875, + "grad_norm": 0.090255506336689, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9361 + }, + { + "epoch": 7.477635782747604, + "grad_norm": 0.07100088149309158, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9362 + }, + { + "epoch": 7.478434504792332, + "grad_norm": 0.14697550237178802, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9363 + }, + { + "epoch": 7.479233226837061, + "grad_norm": 0.14088693261146545, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9364 + }, + { + "epoch": 7.4800319488817895, + "grad_norm": 0.12696029245853424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9365 + }, + { + "epoch": 7.480830670926517, + "grad_norm": 0.15335378050804138, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 9366 + }, + { + "epoch": 7.481629392971246, + "grad_norm": 0.10186830163002014, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9367 + }, + { + "epoch": 7.482428115015974, + "grad_norm": 0.11318683624267578, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 9368 + }, + { + "epoch": 7.483226837060703, + "grad_norm": 0.1290084272623062, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9369 + }, + { + "epoch": 7.484025559105431, + "grad_norm": 0.160775288939476, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 9370 + }, + { + "epoch": 7.48482428115016, + "grad_norm": 0.1998366117477417, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9371 + }, + { + "epoch": 7.485623003194888, + "grad_norm": 0.15808500349521637, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9372 + }, + { + "epoch": 7.486421725239617, + "grad_norm": 0.15403985977172852, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9373 + }, + { + "epoch": 7.487220447284345, + "grad_norm": 0.11963094770908356, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9374 + }, + { + "epoch": 7.488019169329074, + "grad_norm": 0.058245617896318436, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9375 + }, + { + "epoch": 7.488817891373802, + "grad_norm": 0.1256275773048401, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9376 + }, + { + "epoch": 7.48961661341853, + "grad_norm": 0.09230747818946838, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9377 + }, + { + "epoch": 7.4904153354632586, + "grad_norm": 0.15109197795391083, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9378 + }, + { + "epoch": 7.491214057507987, + "grad_norm": 0.20005039870738983, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9379 + }, + { + "epoch": 7.492012779552716, + "grad_norm": 0.08591387420892715, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9380 + }, + { + "epoch": 7.492811501597444, + "grad_norm": 0.07975071668624878, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9381 + }, + { + "epoch": 7.493610223642173, + "grad_norm": 0.1258707046508789, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9382 + }, + { + "epoch": 7.494408945686901, + "grad_norm": 0.16978499293327332, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9383 + }, + { + "epoch": 7.49520766773163, + "grad_norm": 0.09052985906600952, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9384 + }, + { + "epoch": 7.496006389776358, + "grad_norm": 0.15344351530075073, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9385 + }, + { + "epoch": 7.496805111821086, + "grad_norm": 0.04684900864958763, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9386 + }, + { + "epoch": 7.497603833865814, + "grad_norm": 0.09235356748104095, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9387 + }, + { + "epoch": 7.498402555910543, + "grad_norm": 0.0924983024597168, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9388 + }, + { + "epoch": 7.4992012779552715, + "grad_norm": 0.12623359262943268, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9389 + }, + { + "epoch": 7.5, + "grad_norm": 0.08572034537792206, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9390 + }, + { + "epoch": 7.5007987220447285, + "grad_norm": 0.12267094850540161, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9391 + }, + { + "epoch": 7.501597444089457, + "grad_norm": 0.20448675751686096, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9392 + }, + { + "epoch": 7.502396166134186, + "grad_norm": 0.21579930186271667, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9393 + }, + { + "epoch": 7.503194888178914, + "grad_norm": 0.22682903707027435, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9394 + }, + { + "epoch": 7.503993610223642, + "grad_norm": 0.08659582585096359, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 9395 + }, + { + "epoch": 7.50479233226837, + "grad_norm": 0.2064916491508484, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9396 + }, + { + "epoch": 7.505591054313099, + "grad_norm": 0.2137736678123474, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9397 + }, + { + "epoch": 7.506389776357827, + "grad_norm": 0.10891635715961456, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9398 + }, + { + "epoch": 7.507188498402556, + "grad_norm": 0.23018239438533783, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9399 + }, + { + "epoch": 7.507987220447284, + "grad_norm": 0.2091149538755417, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9400 + }, + { + "epoch": 7.508785942492013, + "grad_norm": 0.11136184632778168, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9401 + }, + { + "epoch": 7.5095846645367414, + "grad_norm": 0.1327456831932068, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9402 + }, + { + "epoch": 7.51038338658147, + "grad_norm": 0.08780363947153091, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9403 + }, + { + "epoch": 7.511182108626198, + "grad_norm": 0.14448396861553192, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9404 + }, + { + "epoch": 7.511980830670926, + "grad_norm": 0.12194132804870605, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9405 + }, + { + "epoch": 7.512779552715655, + "grad_norm": 0.09898994117975235, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9406 + }, + { + "epoch": 7.513578274760383, + "grad_norm": 0.0753403753042221, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9407 + }, + { + "epoch": 7.514376996805112, + "grad_norm": 0.1947120577096939, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9408 + }, + { + "epoch": 7.51517571884984, + "grad_norm": 0.10827653110027313, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9409 + }, + { + "epoch": 7.515974440894569, + "grad_norm": 0.06353825330734253, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9410 + }, + { + "epoch": 7.516773162939297, + "grad_norm": 0.16961680352687836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9411 + }, + { + "epoch": 7.517571884984026, + "grad_norm": 0.09001661092042923, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9412 + }, + { + "epoch": 7.518370607028754, + "grad_norm": 0.07342718541622162, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9413 + }, + { + "epoch": 7.519169329073483, + "grad_norm": 0.1001489907503128, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9414 + }, + { + "epoch": 7.5199680511182105, + "grad_norm": 0.10038813948631287, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9415 + }, + { + "epoch": 7.520766773162939, + "grad_norm": 0.17261064052581787, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9416 + }, + { + "epoch": 7.521565495207668, + "grad_norm": 0.10589580982923508, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9417 + }, + { + "epoch": 7.522364217252396, + "grad_norm": 0.055702172219753265, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9418 + }, + { + "epoch": 7.523162939297125, + "grad_norm": 0.122915118932724, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9419 + }, + { + "epoch": 7.523961661341853, + "grad_norm": 0.07361354678869247, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9420 + }, + { + "epoch": 7.524760383386582, + "grad_norm": 0.11187693476676941, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9421 + }, + { + "epoch": 7.52555910543131, + "grad_norm": 0.06205413118004799, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9422 + }, + { + "epoch": 7.526357827476039, + "grad_norm": 0.07805868983268738, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9423 + }, + { + "epoch": 7.527156549520766, + "grad_norm": 0.14349821209907532, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9424 + }, + { + "epoch": 7.527955271565495, + "grad_norm": 0.08928489685058594, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9425 + }, + { + "epoch": 7.5287539936102235, + "grad_norm": 0.10026145726442337, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 9426 + }, + { + "epoch": 7.529552715654952, + "grad_norm": 0.10531286150217056, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9427 + }, + { + "epoch": 7.5303514376996805, + "grad_norm": 0.15984703600406647, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9428 + }, + { + "epoch": 7.531150159744409, + "grad_norm": 0.2948785126209259, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9429 + }, + { + "epoch": 7.531948881789138, + "grad_norm": 0.08823632448911667, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9430 + }, + { + "epoch": 7.532747603833866, + "grad_norm": 0.23016497492790222, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9431 + }, + { + "epoch": 7.533546325878595, + "grad_norm": 0.08874809741973877, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9432 + }, + { + "epoch": 7.534345047923322, + "grad_norm": 0.09074181318283081, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9433 + }, + { + "epoch": 7.535143769968051, + "grad_norm": 0.15151673555374146, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9434 + }, + { + "epoch": 7.535942492012779, + "grad_norm": 0.12276771664619446, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9435 + }, + { + "epoch": 7.536741214057508, + "grad_norm": 0.13978977501392365, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9436 + }, + { + "epoch": 7.537539936102236, + "grad_norm": 0.16208869218826294, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 9437 + }, + { + "epoch": 7.538338658146965, + "grad_norm": 0.16932648420333862, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9438 + }, + { + "epoch": 7.539137380191693, + "grad_norm": 0.09139750897884369, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9439 + }, + { + "epoch": 7.539936102236422, + "grad_norm": 0.11264985054731369, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9440 + }, + { + "epoch": 7.5407348242811505, + "grad_norm": 0.13534623384475708, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9441 + }, + { + "epoch": 7.541533546325878, + "grad_norm": 0.16307172179222107, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9442 + }, + { + "epoch": 7.542332268370607, + "grad_norm": 0.09774577617645264, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9443 + }, + { + "epoch": 7.543130990415335, + "grad_norm": 0.1296136975288391, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9444 + }, + { + "epoch": 7.543929712460064, + "grad_norm": 0.08055619895458221, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9445 + }, + { + "epoch": 7.544728434504792, + "grad_norm": 0.2668273448944092, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9446 + }, + { + "epoch": 7.545527156549521, + "grad_norm": 0.1507730782032013, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9447 + }, + { + "epoch": 7.546325878594249, + "grad_norm": 0.17098994553089142, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9448 + }, + { + "epoch": 7.547124600638978, + "grad_norm": 0.22425173223018646, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9449 + }, + { + "epoch": 7.547923322683706, + "grad_norm": 0.3074493706226349, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9450 + }, + { + "epoch": 7.548722044728435, + "grad_norm": 0.1917268931865692, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9451 + }, + { + "epoch": 7.549520766773163, + "grad_norm": 0.21276478469371796, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9452 + }, + { + "epoch": 7.550319488817891, + "grad_norm": 0.2990981638431549, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9453 + }, + { + "epoch": 7.55111821086262, + "grad_norm": 0.21135985851287842, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9454 + }, + { + "epoch": 7.551916932907348, + "grad_norm": 0.1154661774635315, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9455 + }, + { + "epoch": 7.552715654952077, + "grad_norm": 0.13149744272232056, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9456 + }, + { + "epoch": 7.553514376996805, + "grad_norm": 0.36513134837150574, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9457 + }, + { + "epoch": 7.554313099041534, + "grad_norm": 0.2005227655172348, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9458 + }, + { + "epoch": 7.555111821086262, + "grad_norm": 0.22272491455078125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9459 + }, + { + "epoch": 7.555910543130991, + "grad_norm": 0.05990196391940117, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9460 + }, + { + "epoch": 7.556709265175719, + "grad_norm": 0.20874981582164764, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9461 + }, + { + "epoch": 7.557507987220447, + "grad_norm": 0.10478242486715317, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9462 + }, + { + "epoch": 7.5583067092651754, + "grad_norm": 0.2455470710992813, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9463 + }, + { + "epoch": 7.559105431309904, + "grad_norm": 0.31378838419914246, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9464 + }, + { + "epoch": 7.5599041533546325, + "grad_norm": 0.1903901994228363, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9465 + }, + { + "epoch": 7.560702875399361, + "grad_norm": 0.34334853291511536, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9466 + }, + { + "epoch": 7.56150159744409, + "grad_norm": 0.20050539076328278, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9467 + }, + { + "epoch": 7.562300319488818, + "grad_norm": 0.14147023856639862, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9468 + }, + { + "epoch": 7.563099041533547, + "grad_norm": 0.2242746353149414, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9469 + }, + { + "epoch": 7.563897763578275, + "grad_norm": 0.10040932893753052, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9470 + }, + { + "epoch": 7.564696485623003, + "grad_norm": 0.2527815103530884, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9471 + }, + { + "epoch": 7.565495207667731, + "grad_norm": 0.1675105094909668, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9472 + }, + { + "epoch": 7.56629392971246, + "grad_norm": 0.23818080127239227, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9473 + }, + { + "epoch": 7.567092651757188, + "grad_norm": 0.31956857442855835, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 9474 + }, + { + "epoch": 7.567891373801917, + "grad_norm": 0.15272031724452972, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9475 + }, + { + "epoch": 7.568690095846645, + "grad_norm": 0.20540206134319305, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9476 + }, + { + "epoch": 7.569488817891374, + "grad_norm": 0.2269754856824875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9477 + }, + { + "epoch": 7.5702875399361025, + "grad_norm": 0.19880101084709167, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9478 + }, + { + "epoch": 7.571086261980831, + "grad_norm": 0.2734098732471466, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9479 + }, + { + "epoch": 7.571884984025559, + "grad_norm": 0.17886638641357422, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9480 + }, + { + "epoch": 7.572683706070287, + "grad_norm": 0.15882767736911774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9481 + }, + { + "epoch": 7.573482428115016, + "grad_norm": 0.18066628277301788, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9482 + }, + { + "epoch": 7.574281150159744, + "grad_norm": 0.1025780662894249, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9483 + }, + { + "epoch": 7.575079872204473, + "grad_norm": 0.09417031705379486, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9484 + }, + { + "epoch": 7.575878594249201, + "grad_norm": 0.26811933517456055, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9485 + }, + { + "epoch": 7.57667731629393, + "grad_norm": 0.07128968089818954, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9486 + }, + { + "epoch": 7.577476038338658, + "grad_norm": 0.13026759028434753, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9487 + }, + { + "epoch": 7.578274760383387, + "grad_norm": 0.09879457950592041, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 9488 + }, + { + "epoch": 7.5790734824281145, + "grad_norm": 0.15383538603782654, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9489 + }, + { + "epoch": 7.579872204472844, + "grad_norm": 0.17010194063186646, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 9490 + }, + { + "epoch": 7.580670926517572, + "grad_norm": 0.09413834661245346, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9491 + }, + { + "epoch": 7.5814696485623, + "grad_norm": 0.13111010193824768, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9492 + }, + { + "epoch": 7.582268370607029, + "grad_norm": 0.14170758426189423, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9493 + }, + { + "epoch": 7.583067092651757, + "grad_norm": 0.10549119114875793, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9494 + }, + { + "epoch": 7.583865814696486, + "grad_norm": 0.06767291575670242, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9495 + }, + { + "epoch": 7.584664536741214, + "grad_norm": 0.3329547643661499, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9496 + }, + { + "epoch": 7.585463258785943, + "grad_norm": 0.09325312823057175, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9497 + }, + { + "epoch": 7.586261980830671, + "grad_norm": 0.11408714950084686, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 9498 + }, + { + "epoch": 7.5870607028754, + "grad_norm": 0.10127131640911102, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9499 + }, + { + "epoch": 7.587859424920127, + "grad_norm": 0.14656123518943787, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9500 + } + ], + "logging_steps": 1.0, + "max_steps": 751200, + "num_input_tokens_seen": 0, + "num_train_epochs": 600, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.371660877961888e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9500/training_args.bin b/checkpoint-9500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0162074424e3714af8119d3be2b6e69cbb5b9f2 --- /dev/null +++ b/checkpoint-9500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06816c37733f99d23f044cefd981b2f404a72ddf40fa59f794154596b842fa95 +size 6072 diff --git a/checkpoint-9750/config.json b/checkpoint-9750/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ab0ae1ba49f17c446b66e627e5e96aa2c97bb02d --- /dev/null +++ b/checkpoint-9750/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "checkpoints/vlm_dc-vae-f32c32-sana-1.1_layerwise-0_group-7/checkpoint-9250", + "ar_steps": 1, + "architectures": [ + "DiffVLMDiffusion" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "condition_layer": -1, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "image_token_id": 151655, + "img_cross_attention_dim": 2048, + "img_diffuser_depth": 6, + "img_ffn_dim_multiplier": null, + "img_hidden_size": 1536, + "img_multiple_of": 256, + "img_norm_eps": 1e-05, + "img_num_attention_heads": 12, + "img_num_kv_heads": 12, + "img_qk_norm": true, + "in_channels": 32, + "initializer_range": 0.02, + "inject_img_diffuser": false, + "input_size": 32, + "intermediate_size": 8960, + "layer_group_size": 7, + "layerwise_start_idx": 0, + "lora_alpha": 256, + "lora_bias": "none", + "lora_dropout": 0.05, + "lora_enable": false, + "lora_r": 128, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_vl", + "non_linearity": 1, + "norm_elementwise_affine": true, + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "patch_size": 2, + "repa_coeff": 0.1, + "repa_layers": "2", + "repa_shared": false, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "sample_size": 128, + "sampling_steps": 28, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.47.0", + "use_cache": true, + "use_repa": false, + "use_residual_attn": false, + "use_sliding_window": false, + "vae_path": "mit-han-lab/dc-ae-f32c32-sana-1.1-diffusers", + "video_token_id": 151656, + "vision_config": { + "hidden_size": 1536, + "in_chans": 3, + "model_type": "qwen2_vl", + "spatial_patch_size": 14 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654, + "vocab_size": 151936 +} diff --git a/checkpoint-9750/generation_config.json b/checkpoint-9750/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3d465da90cb0ab59d8ba7babdefb0e88fbfa6b --- /dev/null +++ b/checkpoint-9750/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": 151645, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-9750/model-00001-of-00002.safetensors b/checkpoint-9750/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..159d7c8aad89afd291d17f010c3ae7b3adbff5fe --- /dev/null +++ b/checkpoint-9750/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381565de8ced6edbf7b4e33b4ac5faac4d549911a33bf0dbf525dae559e10260 +size 4998598816 diff --git a/checkpoint-9750/model-00002-of-00002.safetensors b/checkpoint-9750/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f7a66ea2018a9649b711a37f9dc24b837b92027 --- /dev/null +++ b/checkpoint-9750/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ab65f9c6e008e224198d29727410a1d01306cd77deb5ab889b9fc59730a70e7 +size 4990560652 diff --git a/checkpoint-9750/model.safetensors.index.json b/checkpoint-9750/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..b3b85b852967adb370204fb2c3e3d18822b10ab5 --- /dev/null +++ b/checkpoint-9750/model.safetensors.index.json @@ -0,0 +1,1740 @@ +{ + "metadata": { + "total_size": 9988962252 + }, + "weight_map": { + "embed_tokens.weight": "model-00002-of-00002.safetensors", + "img2text.bias": "model-00001-of-00002.safetensors", + "img2text.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_1.weight": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.bias": "model-00001-of-00002.safetensors", + "img_norm_out.linear_2.weight": "model-00001-of-00002.safetensors", + "layers.0.gate": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.0.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.0.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.0.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.0.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.1.gate": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.1.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.1.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.1.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.1.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.10.gate": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.10.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.10.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.10.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.10.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.11.gate": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.11.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.11.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.11.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.11.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.12.gate": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.12.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.12.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.12.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.12.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.13.gate": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.13.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.13.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.13.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.13.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.14.gate": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.14.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.14.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.14.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.14.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.15.gate": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.15.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.15.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.15.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.15.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.16.gate": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.16.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.16.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.16.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.16.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.17.gate": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.17.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.17.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.17.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.17.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.17.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.17.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.17.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.18.gate": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.18.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.18.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.18.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.18.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.19.gate": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.19.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.19.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.19.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.19.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.2.gate": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.2.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.2.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.2.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.2.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.20.gate": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.20.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.20.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.20.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.20.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.21.gate": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.21.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.21.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.21.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.21.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.22.gate": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.22.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.22.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.22.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.22.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.23.gate": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.23.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.23.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.23.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.23.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.24.gate": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.24.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.24.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.24.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.24.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.25.gate": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.25.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.25.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.25.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.25.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.26.gate": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.26.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.26.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.26.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.26.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.27.gate": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.bias": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.norm_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_k.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_out.0.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_q.weight": "model-00002-of-00002.safetensors", + "layers.27.img_attn1.to_v.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_ffn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_post_mixed_attn_norm.weight": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.bias": "model-00002-of-00002.safetensors", + "layers.27.img_scale_shift.linear.weight": "model-00002-of-00002.safetensors", + "layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.down_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.gate_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.mlp.up_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.k_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.o_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.q_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_A.weight": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.bias": "model-00002-of-00002.safetensors", + "layers.27.self_attn.v_proj_lora.lora_B.weight": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.bias": "model-00002-of-00002.safetensors", + "layers.27.to_img_diffuser.weight": "model-00002-of-00002.safetensors", + "layers.3.gate": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.3.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.3.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.3.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.3.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.4.gate": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.4.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.4.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.4.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.4.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.5.gate": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.5.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.5.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.5.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.5.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.6.gate": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.6.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.6.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.6.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.6.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.7.gate": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.7.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.7.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.7.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.7.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.8.gate": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.8.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.8.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.8.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.8.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "layers.9.gate": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.bias": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.norm_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_k.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_out.0.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_q.weight": "model-00001-of-00002.safetensors", + "layers.9.img_attn1.to_v.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_ffn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_post_mixed_attn_norm.weight": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.bias": "model-00001-of-00002.safetensors", + "layers.9.img_scale_shift.linear.weight": "model-00001-of-00002.safetensors", + "layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.down_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.gate_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.mlp.up_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.k_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.o_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.q_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_A.weight": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.bias": "model-00001-of-00002.safetensors", + "layers.9.self_attn.v_proj_lora.lora_B.weight": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.bias": "model-00001-of-00002.safetensors", + "layers.9.to_img_diffuser.weight": "model-00001-of-00002.safetensors", + "norm.weight": "model-00002-of-00002.safetensors", + "patch_embedder.proj.bias": "model-00001-of-00002.safetensors", + "patch_embedder.proj.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_1.weight": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.bias": "model-00001-of-00002.safetensors", + "t_embedder.timestep_embedder.linear_2.weight": "model-00001-of-00002.safetensors", + "vae.decoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.0.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.1.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.2.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv1.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.conv2.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.2.3.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.3.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.0.conv.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.4.3.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.decoder.up_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.conv_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.0.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.1.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.0.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv1.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.conv2.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.1.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.2.2.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.3.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.2.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.4.3.conv.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.0.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.1.conv_out.norm.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.norm_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_k.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_q.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_in.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_qkv_multiscale.0.proj_out.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.attn.to_v.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_depth.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_inverted.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.conv_point.weight": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.bias": "model-00002-of-00002.safetensors", + "vae.encoder.down_blocks.5.2.conv_out.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-9750/optimizer.pt b/checkpoint-9750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..43f299d21a8a68296c96402a1ceccd70f62b3e4d --- /dev/null +++ b/checkpoint-9750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b45325d26849c50a5e36d9223a8419733927708b8ae01bf25c446581dd016407 +size 15084326534 diff --git a/checkpoint-9750/rng_state_0.pth b/checkpoint-9750/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bab1d965098c585e857e45f5e252f53c2c31320 --- /dev/null +++ b/checkpoint-9750/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87da13b958005a7abe509ee3ca425bd5dea7dada2dafa933a86a9952d3116219 +size 15984 diff --git a/checkpoint-9750/rng_state_1.pth b/checkpoint-9750/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..296de5ea6c67ef2afbf286992a1ddd2a8e6f2551 --- /dev/null +++ b/checkpoint-9750/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:798acb12d0bed3c3d0371141f2f39a1026cc9ccf513fbe6b601fcd7f3713d363 +size 15984 diff --git a/checkpoint-9750/rng_state_2.pth b/checkpoint-9750/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..751c721f3bcda662b97eb59e3241fa155ab9b14f --- /dev/null +++ b/checkpoint-9750/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45415618442ab4dc203efb56bfd73d11b95484fc3520ce84fb6717346072503c +size 15984 diff --git a/checkpoint-9750/rng_state_3.pth b/checkpoint-9750/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d63cf06393705c985c3439b3be69628d7e3efa9 --- /dev/null +++ b/checkpoint-9750/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d434ca2dbbba6107d4c052f63c04cca7125b84b3bb8459eb2c29d9a25c6cd2 +size 15984 diff --git a/checkpoint-9750/rng_state_4.pth b/checkpoint-9750/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..3041a3188c2648ba74f98038b7583662834b5562 --- /dev/null +++ b/checkpoint-9750/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07df8be18f3e35d97d9ca4d0ea043a0eb3edd48faaf1328191352d4c699e9fdf +size 15984 diff --git a/checkpoint-9750/rng_state_5.pth b/checkpoint-9750/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b01a7e7dfec02bb52b4f2083b4e2c9c3d09a503 --- /dev/null +++ b/checkpoint-9750/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d622b1a29dba33f88da34611a11ef9455956549a658713176548e0ce018fad3 +size 15984 diff --git a/checkpoint-9750/rng_state_6.pth b/checkpoint-9750/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..a6caab9ac68cd3c4f425f2958ac0aea5c6fcd184 --- /dev/null +++ b/checkpoint-9750/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9558c4d79a7434e988d9b9fc8ce76b2055c778fbc2e08974c94de99e34336e +size 15984 diff --git a/checkpoint-9750/rng_state_7.pth b/checkpoint-9750/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e18582abcd27190782a111e0725a6e7b7cff82d --- /dev/null +++ b/checkpoint-9750/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e80a69ebce9a6e24afd77bcbe293ab065d29056c15df198b271aba5f1c0c2a7 +size 15984 diff --git a/checkpoint-9750/scheduler.pt b/checkpoint-9750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd9304ee5bcb40a5d8bf9d25d7a40a5486717eda --- /dev/null +++ b/checkpoint-9750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a84e3aae193ee9563fc7d23b170631364aa49ab8e79c7d19ae06771164ebfc8 +size 1064 diff --git a/checkpoint-9750/trainer_state.json b/checkpoint-9750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..873df0dc1979415940068c0d411501c0f33b2106 --- /dev/null +++ b/checkpoint-9750/trainer_state.json @@ -0,0 +1,68283 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.787539936102236, + "eval_steps": 500, + "global_step": 9750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007987220447284345, + "grad_norm": 0.08758673816919327, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1 + }, + { + "epoch": 0.001597444089456869, + "grad_norm": 2.9034857749938965, + "learning_rate": 0.0005, + "loss": 1.5342, + "step": 2 + }, + { + "epoch": 0.0023961661341853034, + "grad_norm": 1.260856032371521, + "learning_rate": 0.0005, + "loss": 1.3074, + "step": 3 + }, + { + "epoch": 0.003194888178913738, + "grad_norm": 2.2480077743530273, + "learning_rate": 0.0005, + "loss": 1.3434, + "step": 4 + }, + { + "epoch": 0.003993610223642172, + "grad_norm": 0.6822420358657837, + "learning_rate": 0.0005, + "loss": 1.2075, + "step": 5 + }, + { + "epoch": 0.004792332268370607, + "grad_norm": 0.7826036214828491, + "learning_rate": 0.0005, + "loss": 1.1844, + "step": 6 + }, + { + "epoch": 0.005591054313099041, + "grad_norm": 0.690284788608551, + "learning_rate": 0.0005, + "loss": 1.1513, + "step": 7 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 0.49136775732040405, + "learning_rate": 0.0005, + "loss": 1.1229, + "step": 8 + }, + { + "epoch": 0.00718849840255591, + "grad_norm": 0.3124309182167053, + "learning_rate": 0.0005, + "loss": 1.1105, + "step": 9 + }, + { + "epoch": 0.007987220447284345, + "grad_norm": 0.3409576714038849, + "learning_rate": 0.0005, + "loss": 1.1034, + "step": 10 + }, + { + "epoch": 0.00878594249201278, + "grad_norm": 0.25508174300193787, + "learning_rate": 0.0005, + "loss": 1.0982, + "step": 11 + }, + { + "epoch": 0.009584664536741214, + "grad_norm": 0.19042040407657623, + "learning_rate": 0.0005, + "loss": 1.0953, + "step": 12 + }, + { + "epoch": 0.010383386581469648, + "grad_norm": 0.2090323120355606, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 13 + }, + { + "epoch": 0.011182108626198083, + "grad_norm": 0.2102068066596985, + "learning_rate": 0.0005, + "loss": 1.0903, + "step": 14 + }, + { + "epoch": 0.011980830670926517, + "grad_norm": 0.12789177894592285, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 15 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 0.10204717516899109, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 16 + }, + { + "epoch": 0.013578274760383386, + "grad_norm": 0.174830362200737, + "learning_rate": 0.0005, + "loss": 1.0814, + "step": 17 + }, + { + "epoch": 0.01437699680511182, + "grad_norm": 0.25637468695640564, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 18 + }, + { + "epoch": 0.015175718849840255, + "grad_norm": 0.28002411127090454, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 19 + }, + { + "epoch": 0.01597444089456869, + "grad_norm": 0.23047354817390442, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 20 + }, + { + "epoch": 0.016773162939297124, + "grad_norm": 0.1548614650964737, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 21 + }, + { + "epoch": 0.01757188498402556, + "grad_norm": 0.07078541815280914, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 22 + }, + { + "epoch": 0.018370607028753993, + "grad_norm": 0.10615550726652145, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 23 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 0.10240291804075241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 24 + }, + { + "epoch": 0.019968051118210862, + "grad_norm": 0.07588993012905121, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 25 + }, + { + "epoch": 0.020766773162939296, + "grad_norm": 0.06380276381969452, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 26 + }, + { + "epoch": 0.02156549520766773, + "grad_norm": 0.06891524791717529, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 27 + }, + { + "epoch": 0.022364217252396165, + "grad_norm": 0.0625377744436264, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 28 + }, + { + "epoch": 0.0231629392971246, + "grad_norm": 0.12064792215824127, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 29 + }, + { + "epoch": 0.023961661341853034, + "grad_norm": 0.29220151901245117, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 30 + }, + { + "epoch": 0.02476038338658147, + "grad_norm": 0.7822219729423523, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 31 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 1.5172864198684692, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 32 + }, + { + "epoch": 0.026357827476038338, + "grad_norm": 0.18434809148311615, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 33 + }, + { + "epoch": 0.027156549520766772, + "grad_norm": 0.535632848739624, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 34 + }, + { + "epoch": 0.027955271565495207, + "grad_norm": 0.21549028158187866, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 35 + }, + { + "epoch": 0.02875399361022364, + "grad_norm": 0.4726889431476593, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 36 + }, + { + "epoch": 0.029552715654952075, + "grad_norm": 0.2519988417625427, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 37 + }, + { + "epoch": 0.03035143769968051, + "grad_norm": 0.2973701059818268, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 38 + }, + { + "epoch": 0.031150159744408944, + "grad_norm": 0.30153587460517883, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 39 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 0.08746712654829025, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 40 + }, + { + "epoch": 0.03274760383386582, + "grad_norm": 0.3308769762516022, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 41 + }, + { + "epoch": 0.03354632587859425, + "grad_norm": 0.10948555171489716, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 42 + }, + { + "epoch": 0.034345047923322686, + "grad_norm": 0.3044797480106354, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 43 + }, + { + "epoch": 0.03514376996805112, + "grad_norm": 0.11677752435207367, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 44 + }, + { + "epoch": 0.035942492012779555, + "grad_norm": 0.30327609181404114, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 45 + }, + { + "epoch": 0.036741214057507986, + "grad_norm": 0.10603009909391403, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 46 + }, + { + "epoch": 0.037539936102236424, + "grad_norm": 0.2693077623844147, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 47 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.11918680369853973, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 48 + }, + { + "epoch": 0.03913738019169329, + "grad_norm": 0.2965734899044037, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 49 + }, + { + "epoch": 0.039936102236421724, + "grad_norm": 0.10428953915834427, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 50 + }, + { + "epoch": 0.04073482428115016, + "grad_norm": 0.23307208716869354, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 51 + }, + { + "epoch": 0.04153354632587859, + "grad_norm": 0.07401563227176666, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 52 + }, + { + "epoch": 0.04233226837060703, + "grad_norm": 0.22344312071800232, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 53 + }, + { + "epoch": 0.04313099041533546, + "grad_norm": 0.1782081127166748, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 54 + }, + { + "epoch": 0.0439297124600639, + "grad_norm": 0.10123606026172638, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 55 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 0.2618716359138489, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 56 + }, + { + "epoch": 0.04552715654952077, + "grad_norm": 0.15046533942222595, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 57 + }, + { + "epoch": 0.0463258785942492, + "grad_norm": 0.1341097205877304, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 58 + }, + { + "epoch": 0.04712460063897764, + "grad_norm": 0.20391245186328888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 59 + }, + { + "epoch": 0.04792332268370607, + "grad_norm": 0.09610722959041595, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 60 + }, + { + "epoch": 0.048722044728434506, + "grad_norm": 0.09877557307481766, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 61 + }, + { + "epoch": 0.04952076677316294, + "grad_norm": 0.16971156001091003, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 62 + }, + { + "epoch": 0.050319488817891375, + "grad_norm": 0.1819174885749817, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 63 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.13067278265953064, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 64 + }, + { + "epoch": 0.051916932907348244, + "grad_norm": 0.10557633638381958, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 65 + }, + { + "epoch": 0.052715654952076675, + "grad_norm": 0.08713806420564651, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 66 + }, + { + "epoch": 0.05351437699680511, + "grad_norm": 0.12453104555606842, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 67 + }, + { + "epoch": 0.054313099041533544, + "grad_norm": 0.19147996604442596, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 68 + }, + { + "epoch": 0.05511182108626198, + "grad_norm": 0.21808673441410065, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 69 + }, + { + "epoch": 0.05591054313099041, + "grad_norm": 0.15922780334949493, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 70 + }, + { + "epoch": 0.05670926517571885, + "grad_norm": 0.09400095790624619, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 71 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 0.071605384349823, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 72 + }, + { + "epoch": 0.05830670926517572, + "grad_norm": 0.08754080533981323, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 73 + }, + { + "epoch": 0.05910543130990415, + "grad_norm": 0.07777409255504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 74 + }, + { + "epoch": 0.05990415335463259, + "grad_norm": 0.04577887803316116, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 75 + }, + { + "epoch": 0.06070287539936102, + "grad_norm": 0.07278449088335037, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 76 + }, + { + "epoch": 0.06150159744408946, + "grad_norm": 0.06739042699337006, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 77 + }, + { + "epoch": 0.06230031948881789, + "grad_norm": 0.06367938220500946, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 78 + }, + { + "epoch": 0.06309904153354633, + "grad_norm": 0.0551401786506176, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 79 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.04846199229359627, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 80 + }, + { + "epoch": 0.06469648562300319, + "grad_norm": 0.089615598320961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 81 + }, + { + "epoch": 0.06549520766773163, + "grad_norm": 0.19073566794395447, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 82 + }, + { + "epoch": 0.06629392971246006, + "grad_norm": 0.26971691846847534, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 83 + }, + { + "epoch": 0.0670926517571885, + "grad_norm": 0.3124604821205139, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 84 + }, + { + "epoch": 0.06789137380191693, + "grad_norm": 0.3448403775691986, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 85 + }, + { + "epoch": 0.06869009584664537, + "grad_norm": 0.2708166837692261, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 86 + }, + { + "epoch": 0.0694888178913738, + "grad_norm": 0.10507494956254959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 87 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 0.1015392392873764, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 88 + }, + { + "epoch": 0.07108626198083066, + "grad_norm": 0.34002622961997986, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 89 + }, + { + "epoch": 0.07188498402555911, + "grad_norm": 0.5238372683525085, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 90 + }, + { + "epoch": 0.07268370607028754, + "grad_norm": 0.5267866253852844, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 91 + }, + { + "epoch": 0.07348242811501597, + "grad_norm": 0.3286864757537842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 92 + }, + { + "epoch": 0.0742811501597444, + "grad_norm": 0.14270304143428802, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 93 + }, + { + "epoch": 0.07507987220447285, + "grad_norm": 0.3481365740299225, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 94 + }, + { + "epoch": 0.07587859424920128, + "grad_norm": 0.33883902430534363, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 95 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.2553725838661194, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 96 + }, + { + "epoch": 0.07747603833865814, + "grad_norm": 0.21944141387939453, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 97 + }, + { + "epoch": 0.07827476038338659, + "grad_norm": 0.18821558356285095, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 98 + }, + { + "epoch": 0.07907348242811502, + "grad_norm": 0.20073482394218445, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 99 + }, + { + "epoch": 0.07987220447284345, + "grad_norm": 0.2643139958381653, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 100 + }, + { + "epoch": 0.08067092651757188, + "grad_norm": 0.1843930184841156, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 101 + }, + { + "epoch": 0.08146964856230032, + "grad_norm": 0.12745684385299683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 102 + }, + { + "epoch": 0.08226837060702875, + "grad_norm": 0.3252592384815216, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 103 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 0.33775797486305237, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 104 + }, + { + "epoch": 0.08386581469648563, + "grad_norm": 0.24846483767032623, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 105 + }, + { + "epoch": 0.08466453674121406, + "grad_norm": 0.1598653495311737, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 106 + }, + { + "epoch": 0.08546325878594249, + "grad_norm": 0.2555698752403259, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 107 + }, + { + "epoch": 0.08626198083067092, + "grad_norm": 0.3770487308502197, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 108 + }, + { + "epoch": 0.08706070287539937, + "grad_norm": 0.3179391026496887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 109 + }, + { + "epoch": 0.0878594249201278, + "grad_norm": 0.11638858914375305, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 110 + }, + { + "epoch": 0.08865814696485623, + "grad_norm": 0.20365215837955475, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 111 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.22354111075401306, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 112 + }, + { + "epoch": 0.0902555910543131, + "grad_norm": 0.1944236010313034, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 113 + }, + { + "epoch": 0.09105431309904154, + "grad_norm": 0.16177603602409363, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 114 + }, + { + "epoch": 0.09185303514376997, + "grad_norm": 0.06650812178850174, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 115 + }, + { + "epoch": 0.0926517571884984, + "grad_norm": 0.20236945152282715, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 116 + }, + { + "epoch": 0.09345047923322684, + "grad_norm": 0.19086670875549316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 117 + }, + { + "epoch": 0.09424920127795527, + "grad_norm": 0.17380473017692566, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 118 + }, + { + "epoch": 0.0950479233226837, + "grad_norm": 0.11360115557909012, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 119 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 0.09359298646450043, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 120 + }, + { + "epoch": 0.09664536741214058, + "grad_norm": 0.15317411720752716, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 121 + }, + { + "epoch": 0.09744408945686901, + "grad_norm": 0.05564137175679207, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 122 + }, + { + "epoch": 0.09824281150159744, + "grad_norm": 0.13476046919822693, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 123 + }, + { + "epoch": 0.09904153354632587, + "grad_norm": 0.11372318118810654, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 124 + }, + { + "epoch": 0.09984025559105432, + "grad_norm": 0.11330179125070572, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 125 + }, + { + "epoch": 0.10063897763578275, + "grad_norm": 0.11304716765880585, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 126 + }, + { + "epoch": 0.10143769968051118, + "grad_norm": 0.06369871646165848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 127 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.14034464955329895, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 128 + }, + { + "epoch": 0.10303514376996806, + "grad_norm": 0.1080808937549591, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 129 + }, + { + "epoch": 0.10383386581469649, + "grad_norm": 0.09568007290363312, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 130 + }, + { + "epoch": 0.10463258785942492, + "grad_norm": 0.1359473019838333, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 131 + }, + { + "epoch": 0.10543130990415335, + "grad_norm": 0.06500346213579178, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 132 + }, + { + "epoch": 0.1062300319488818, + "grad_norm": 0.11564832180738449, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 133 + }, + { + "epoch": 0.10702875399361023, + "grad_norm": 0.2115149199962616, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 134 + }, + { + "epoch": 0.10782747603833866, + "grad_norm": 0.3098243772983551, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 135 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 0.446521133184433, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 136 + }, + { + "epoch": 0.10942492012779553, + "grad_norm": 0.5194831490516663, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 137 + }, + { + "epoch": 0.11022364217252396, + "grad_norm": 0.447731077671051, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 138 + }, + { + "epoch": 0.1110223642172524, + "grad_norm": 0.2195945680141449, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 139 + }, + { + "epoch": 0.11182108626198083, + "grad_norm": 0.1277567446231842, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 140 + }, + { + "epoch": 0.11261980830670927, + "grad_norm": 0.3284558355808258, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 141 + }, + { + "epoch": 0.1134185303514377, + "grad_norm": 0.40208715200424194, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 142 + }, + { + "epoch": 0.11421725239616613, + "grad_norm": 0.28310486674308777, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 143 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.0786294937133789, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 144 + }, + { + "epoch": 0.11581469648562301, + "grad_norm": 0.18283484876155853, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 145 + }, + { + "epoch": 0.11661341853035144, + "grad_norm": 0.20186439156532288, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 146 + }, + { + "epoch": 0.11741214057507987, + "grad_norm": 0.15860706567764282, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 147 + }, + { + "epoch": 0.1182108626198083, + "grad_norm": 0.1436982899904251, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 148 + }, + { + "epoch": 0.11900958466453675, + "grad_norm": 0.15206722915172577, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 149 + }, + { + "epoch": 0.11980830670926518, + "grad_norm": 0.252279132604599, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 150 + }, + { + "epoch": 0.12060702875399361, + "grad_norm": 0.19411228597164154, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 151 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 0.07377714663743973, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 152 + }, + { + "epoch": 0.12220447284345048, + "grad_norm": 0.15493856370449066, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 153 + }, + { + "epoch": 0.12300319488817892, + "grad_norm": 0.275601863861084, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 154 + }, + { + "epoch": 0.12380191693290735, + "grad_norm": 0.42461103200912476, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 155 + }, + { + "epoch": 0.12460063897763578, + "grad_norm": 0.41153159737586975, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 156 + }, + { + "epoch": 0.1253993610223642, + "grad_norm": 0.2487967610359192, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 157 + }, + { + "epoch": 0.12619808306709265, + "grad_norm": 0.10687623918056488, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 158 + }, + { + "epoch": 0.1269968051118211, + "grad_norm": 0.28695282340049744, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 159 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.38554099202156067, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 160 + }, + { + "epoch": 0.12859424920127796, + "grad_norm": 0.25622498989105225, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 161 + }, + { + "epoch": 0.12939297124600638, + "grad_norm": 0.10341542959213257, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 162 + }, + { + "epoch": 0.13019169329073482, + "grad_norm": 0.20450755953788757, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 163 + }, + { + "epoch": 0.13099041533546327, + "grad_norm": 0.2664271295070648, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 164 + }, + { + "epoch": 0.13178913738019168, + "grad_norm": 0.23936089873313904, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 165 + }, + { + "epoch": 0.13258785942492013, + "grad_norm": 0.0662769302725792, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 166 + }, + { + "epoch": 0.13338658146964857, + "grad_norm": 0.13597780466079712, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 167 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 0.15996500849723816, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 168 + }, + { + "epoch": 0.13498402555910544, + "grad_norm": 0.10095447301864624, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 169 + }, + { + "epoch": 0.13578274760383385, + "grad_norm": 0.09733449667692184, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 170 + }, + { + "epoch": 0.1365814696485623, + "grad_norm": 0.16480964422225952, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 171 + }, + { + "epoch": 0.13738019169329074, + "grad_norm": 0.21611596643924713, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 172 + }, + { + "epoch": 0.13817891373801916, + "grad_norm": 0.21607941389083862, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 173 + }, + { + "epoch": 0.1389776357827476, + "grad_norm": 0.2234959453344345, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 174 + }, + { + "epoch": 0.13977635782747605, + "grad_norm": 0.10778137296438217, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 175 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.1758418083190918, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 176 + }, + { + "epoch": 0.1413738019169329, + "grad_norm": 0.30717936158180237, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 177 + }, + { + "epoch": 0.14217252396166133, + "grad_norm": 0.3382156789302826, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 178 + }, + { + "epoch": 0.14297124600638977, + "grad_norm": 0.23189185559749603, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 179 + }, + { + "epoch": 0.14376996805111822, + "grad_norm": 0.04988733306527138, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 180 + }, + { + "epoch": 0.14456869009584664, + "grad_norm": 0.15606579184532166, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 181 + }, + { + "epoch": 0.14536741214057508, + "grad_norm": 0.2366417795419693, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 182 + }, + { + "epoch": 0.14616613418530353, + "grad_norm": 0.21878089010715485, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 183 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 0.09316077083349228, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 184 + }, + { + "epoch": 0.1477635782747604, + "grad_norm": 0.119263656437397, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 185 + }, + { + "epoch": 0.1485623003194888, + "grad_norm": 0.26743847131729126, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 186 + }, + { + "epoch": 0.14936102236421725, + "grad_norm": 0.34438276290893555, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 187 + }, + { + "epoch": 0.1501597444089457, + "grad_norm": 0.30809128284454346, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 188 + }, + { + "epoch": 0.1509584664536741, + "grad_norm": 0.1406010240316391, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 189 + }, + { + "epoch": 0.15175718849840256, + "grad_norm": 0.09509757161140442, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 190 + }, + { + "epoch": 0.152555910543131, + "grad_norm": 0.24529854953289032, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 191 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.2803219258785248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 192 + }, + { + "epoch": 0.15415335463258786, + "grad_norm": 0.18221652507781982, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 193 + }, + { + "epoch": 0.15495207667731628, + "grad_norm": 0.04752795770764351, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 194 + }, + { + "epoch": 0.15575079872204473, + "grad_norm": 0.14151020348072052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 195 + }, + { + "epoch": 0.15654952076677317, + "grad_norm": 0.27345412969589233, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 196 + }, + { + "epoch": 0.1573482428115016, + "grad_norm": 0.36259710788726807, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 197 + }, + { + "epoch": 0.15814696485623003, + "grad_norm": 0.30899694561958313, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 198 + }, + { + "epoch": 0.15894568690095848, + "grad_norm": 0.148394376039505, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 199 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 0.09150427579879761, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 200 + }, + { + "epoch": 0.16054313099041534, + "grad_norm": 0.2579229176044464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 201 + }, + { + "epoch": 0.16134185303514376, + "grad_norm": 0.35417553782463074, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 202 + }, + { + "epoch": 0.1621405750798722, + "grad_norm": 0.3410634994506836, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 203 + }, + { + "epoch": 0.16293929712460065, + "grad_norm": 0.20597697794437408, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 204 + }, + { + "epoch": 0.16373801916932906, + "grad_norm": 0.09722702950239182, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 205 + }, + { + "epoch": 0.1645367412140575, + "grad_norm": 0.29214075207710266, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 206 + }, + { + "epoch": 0.16533546325878595, + "grad_norm": 0.35695526003837585, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 207 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.23948919773101807, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 208 + }, + { + "epoch": 0.16693290734824281, + "grad_norm": 0.06467479467391968, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 209 + }, + { + "epoch": 0.16773162939297126, + "grad_norm": 0.2935601472854614, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 210 + }, + { + "epoch": 0.16853035143769968, + "grad_norm": 0.3354688882827759, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 211 + }, + { + "epoch": 0.16932907348242812, + "grad_norm": 0.206736221909523, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 212 + }, + { + "epoch": 0.17012779552715654, + "grad_norm": 0.04770192503929138, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 213 + }, + { + "epoch": 0.17092651757188498, + "grad_norm": 0.11713571101427078, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 214 + }, + { + "epoch": 0.17172523961661343, + "grad_norm": 0.1751943975687027, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 215 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 0.11709283292293549, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 216 + }, + { + "epoch": 0.1733226837060703, + "grad_norm": 0.08393140882253647, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 217 + }, + { + "epoch": 0.17412140575079874, + "grad_norm": 0.14036497473716736, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 218 + }, + { + "epoch": 0.17492012779552715, + "grad_norm": 0.19809649884700775, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 219 + }, + { + "epoch": 0.1757188498402556, + "grad_norm": 0.16380994021892548, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 220 + }, + { + "epoch": 0.17651757188498401, + "grad_norm": 0.03721015155315399, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 221 + }, + { + "epoch": 0.17731629392971246, + "grad_norm": 0.16769659519195557, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 222 + }, + { + "epoch": 0.1781150159744409, + "grad_norm": 0.2506882846355438, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 223 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 0.2812851667404175, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 224 + }, + { + "epoch": 0.17971246006389777, + "grad_norm": 0.2518095374107361, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 225 + }, + { + "epoch": 0.1805111821086262, + "grad_norm": 0.13027259707450867, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 226 + }, + { + "epoch": 0.18130990415335463, + "grad_norm": 0.051758985966444016, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 227 + }, + { + "epoch": 0.18210862619808307, + "grad_norm": 0.123250812292099, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 228 + }, + { + "epoch": 0.1829073482428115, + "grad_norm": 0.16475827991962433, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 229 + }, + { + "epoch": 0.18370607028753994, + "grad_norm": 0.15224772691726685, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 230 + }, + { + "epoch": 0.18450479233226838, + "grad_norm": 0.10693283379077911, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 231 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 0.059128716588020325, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 232 + }, + { + "epoch": 0.18610223642172524, + "grad_norm": 0.1689453125, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 233 + }, + { + "epoch": 0.1869009584664537, + "grad_norm": 0.21447211503982544, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 234 + }, + { + "epoch": 0.1876996805111821, + "grad_norm": 0.214809849858284, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 235 + }, + { + "epoch": 0.18849840255591055, + "grad_norm": 0.16398873925209045, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 236 + }, + { + "epoch": 0.18929712460063897, + "grad_norm": 0.08273304253816605, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 237 + }, + { + "epoch": 0.1900958466453674, + "grad_norm": 0.08456159383058548, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 238 + }, + { + "epoch": 0.19089456869009586, + "grad_norm": 0.09653522819280624, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 239 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.13169406354427338, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 240 + }, + { + "epoch": 0.19249201277955272, + "grad_norm": 0.2328217476606369, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 241 + }, + { + "epoch": 0.19329073482428116, + "grad_norm": 0.2226463258266449, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 242 + }, + { + "epoch": 0.19408945686900958, + "grad_norm": 0.13330090045928955, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 243 + }, + { + "epoch": 0.19488817891373802, + "grad_norm": 0.15685412287712097, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 244 + }, + { + "epoch": 0.19568690095846644, + "grad_norm": 0.1528809666633606, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 245 + }, + { + "epoch": 0.1964856230031949, + "grad_norm": 0.2380320429801941, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 246 + }, + { + "epoch": 0.19728434504792333, + "grad_norm": 0.20447947084903717, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 247 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 0.162733793258667, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 248 + }, + { + "epoch": 0.1988817891373802, + "grad_norm": 0.10536827147006989, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 249 + }, + { + "epoch": 0.19968051118210864, + "grad_norm": 0.05464514344930649, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 250 + }, + { + "epoch": 0.20047923322683706, + "grad_norm": 0.052793700248003006, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 251 + }, + { + "epoch": 0.2012779552715655, + "grad_norm": 0.06936854124069214, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 252 + }, + { + "epoch": 0.20207667731629392, + "grad_norm": 0.17630355060100555, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 253 + }, + { + "epoch": 0.20287539936102236, + "grad_norm": 0.23443830013275146, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 254 + }, + { + "epoch": 0.2036741214057508, + "grad_norm": 0.21788854897022247, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 255 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.16827379167079926, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 256 + }, + { + "epoch": 0.20527156549520767, + "grad_norm": 0.08467451483011246, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 257 + }, + { + "epoch": 0.20607028753993611, + "grad_norm": 0.17747341096401215, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 258 + }, + { + "epoch": 0.20686900958466453, + "grad_norm": 0.20212751626968384, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 259 + }, + { + "epoch": 0.20766773162939298, + "grad_norm": 0.13319599628448486, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 260 + }, + { + "epoch": 0.2084664536741214, + "grad_norm": 0.13839752972126007, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 261 + }, + { + "epoch": 0.20926517571884984, + "grad_norm": 0.12351422011852264, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 262 + }, + { + "epoch": 0.21006389776357828, + "grad_norm": 0.1166408434510231, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 263 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 0.15500681102275848, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 264 + }, + { + "epoch": 0.21166134185303515, + "grad_norm": 0.045156076550483704, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 265 + }, + { + "epoch": 0.2124600638977636, + "grad_norm": 0.1413601189851761, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 266 + }, + { + "epoch": 0.213258785942492, + "grad_norm": 0.19309845566749573, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 267 + }, + { + "epoch": 0.21405750798722045, + "grad_norm": 0.22837650775909424, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 268 + }, + { + "epoch": 0.21485623003194887, + "grad_norm": 0.23372405767440796, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 269 + }, + { + "epoch": 0.21565495207667731, + "grad_norm": 0.2030618041753769, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 270 + }, + { + "epoch": 0.21645367412140576, + "grad_norm": 0.2092818021774292, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 271 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.18329963088035583, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 272 + }, + { + "epoch": 0.21805111821086262, + "grad_norm": 0.07353675365447998, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 273 + }, + { + "epoch": 0.21884984025559107, + "grad_norm": 0.08853492140769958, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 274 + }, + { + "epoch": 0.21964856230031948, + "grad_norm": 0.14666804671287537, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 275 + }, + { + "epoch": 0.22044728434504793, + "grad_norm": 0.12529602646827698, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 276 + }, + { + "epoch": 0.22124600638977635, + "grad_norm": 0.1571074277162552, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 277 + }, + { + "epoch": 0.2220447284345048, + "grad_norm": 0.09636949002742767, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 278 + }, + { + "epoch": 0.22284345047923323, + "grad_norm": 0.16803453862667084, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 279 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 0.258849561214447, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 280 + }, + { + "epoch": 0.2244408945686901, + "grad_norm": 0.29162102937698364, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 281 + }, + { + "epoch": 0.22523961661341854, + "grad_norm": 0.32085782289505005, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 282 + }, + { + "epoch": 0.22603833865814696, + "grad_norm": 0.24114084243774414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 283 + }, + { + "epoch": 0.2268370607028754, + "grad_norm": 0.11804991215467453, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 284 + }, + { + "epoch": 0.22763578274760382, + "grad_norm": 0.16640789806842804, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 285 + }, + { + "epoch": 0.22843450479233227, + "grad_norm": 0.33951282501220703, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 286 + }, + { + "epoch": 0.2292332268370607, + "grad_norm": 0.3939269483089447, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 287 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.2742229402065277, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 288 + }, + { + "epoch": 0.23083067092651757, + "grad_norm": 0.1000385507941246, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 289 + }, + { + "epoch": 0.23162939297124602, + "grad_norm": 0.15618765354156494, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 290 + }, + { + "epoch": 0.23242811501597443, + "grad_norm": 0.3464474081993103, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 291 + }, + { + "epoch": 0.23322683706070288, + "grad_norm": 0.4524421989917755, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 292 + }, + { + "epoch": 0.2340255591054313, + "grad_norm": 0.38890203833580017, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 293 + }, + { + "epoch": 0.23482428115015974, + "grad_norm": 0.15225796401500702, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 294 + }, + { + "epoch": 0.2356230031948882, + "grad_norm": 0.18742015957832336, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 295 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 0.454607754945755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 296 + }, + { + "epoch": 0.23722044728434505, + "grad_norm": 0.4426102638244629, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 297 + }, + { + "epoch": 0.2380191693290735, + "grad_norm": 0.1442587673664093, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 298 + }, + { + "epoch": 0.2388178913738019, + "grad_norm": 0.2338172197341919, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 299 + }, + { + "epoch": 0.23961661341853036, + "grad_norm": 0.4115936756134033, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 300 + }, + { + "epoch": 0.24041533546325877, + "grad_norm": 0.38746342062950134, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 301 + }, + { + "epoch": 0.24121405750798722, + "grad_norm": 0.11506912112236023, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 302 + }, + { + "epoch": 0.24201277955271566, + "grad_norm": 0.20454810559749603, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 303 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.34620603919029236, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 304 + }, + { + "epoch": 0.24361022364217252, + "grad_norm": 0.27727624773979187, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 305 + }, + { + "epoch": 0.24440894568690097, + "grad_norm": 0.062395140528678894, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 306 + }, + { + "epoch": 0.2452076677316294, + "grad_norm": 0.25391891598701477, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 307 + }, + { + "epoch": 0.24600638977635783, + "grad_norm": 0.3807840049266815, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 308 + }, + { + "epoch": 0.24680511182108625, + "grad_norm": 0.31564414501190186, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 309 + }, + { + "epoch": 0.2476038338658147, + "grad_norm": 0.044667672365903854, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 310 + }, + { + "epoch": 0.24840255591054314, + "grad_norm": 0.2656041979789734, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 311 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 0.2954655587673187, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 312 + }, + { + "epoch": 0.25, + "grad_norm": 0.14636820554733276, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 313 + }, + { + "epoch": 0.2507987220447284, + "grad_norm": 0.16759099066257477, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 314 + }, + { + "epoch": 0.2515974440894569, + "grad_norm": 0.28777605295181274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 315 + }, + { + "epoch": 0.2523961661341853, + "grad_norm": 0.2817089855670929, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 316 + }, + { + "epoch": 0.2531948881789137, + "grad_norm": 0.09457004815340042, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 317 + }, + { + "epoch": 0.2539936102236422, + "grad_norm": 0.15224558115005493, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 318 + }, + { + "epoch": 0.2547923322683706, + "grad_norm": 0.17883236706256866, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 319 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.08269336074590683, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 320 + }, + { + "epoch": 0.2563897763578275, + "grad_norm": 0.10430650413036346, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 321 + }, + { + "epoch": 0.2571884984025559, + "grad_norm": 0.06464210897684097, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 322 + }, + { + "epoch": 0.25798722044728434, + "grad_norm": 0.08100844919681549, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 323 + }, + { + "epoch": 0.25878594249201275, + "grad_norm": 0.10375291109085083, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 324 + }, + { + "epoch": 0.2595846645367412, + "grad_norm": 0.14621509611606598, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 325 + }, + { + "epoch": 0.26038338658146964, + "grad_norm": 0.12707975506782532, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 326 + }, + { + "epoch": 0.26118210862619806, + "grad_norm": 0.04542430862784386, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 327 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 0.13504259288311005, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 328 + }, + { + "epoch": 0.26277955271565495, + "grad_norm": 0.20337320864200592, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 329 + }, + { + "epoch": 0.26357827476038337, + "grad_norm": 0.23682020604610443, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 330 + }, + { + "epoch": 0.26437699680511184, + "grad_norm": 0.15198387205600739, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 331 + }, + { + "epoch": 0.26517571884984026, + "grad_norm": 0.04014969989657402, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 332 + }, + { + "epoch": 0.2659744408945687, + "grad_norm": 0.10505357384681702, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 333 + }, + { + "epoch": 0.26677316293929715, + "grad_norm": 0.08121145516633987, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 334 + }, + { + "epoch": 0.26757188498402557, + "grad_norm": 0.062118109315633774, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 335 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.13389311730861664, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 336 + }, + { + "epoch": 0.26916932907348246, + "grad_norm": 0.24840199947357178, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 337 + }, + { + "epoch": 0.26996805111821087, + "grad_norm": 0.33511659502983093, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 338 + }, + { + "epoch": 0.2707667731629393, + "grad_norm": 0.2905866801738739, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 339 + }, + { + "epoch": 0.2715654952076677, + "grad_norm": 0.15471668541431427, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 340 + }, + { + "epoch": 0.2723642172523962, + "grad_norm": 0.09973842650651932, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 341 + }, + { + "epoch": 0.2731629392971246, + "grad_norm": 0.19315758347511292, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 342 + }, + { + "epoch": 0.273961661341853, + "grad_norm": 0.2122231423854828, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 343 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 0.11207931488752365, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 344 + }, + { + "epoch": 0.2755591054313099, + "grad_norm": 0.11863203346729279, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 345 + }, + { + "epoch": 0.2763578274760383, + "grad_norm": 0.22022183239459991, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 346 + }, + { + "epoch": 0.2771565495207668, + "grad_norm": 0.225724458694458, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 347 + }, + { + "epoch": 0.2779552715654952, + "grad_norm": 0.1622191071510315, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 348 + }, + { + "epoch": 0.2787539936102236, + "grad_norm": 0.05987359210848808, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 349 + }, + { + "epoch": 0.2795527156549521, + "grad_norm": 0.08514829725027084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 350 + }, + { + "epoch": 0.2803514376996805, + "grad_norm": 0.10734611004590988, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 351 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.12458663433790207, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 352 + }, + { + "epoch": 0.2819488817891374, + "grad_norm": 0.12223048508167267, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 353 + }, + { + "epoch": 0.2827476038338658, + "grad_norm": 0.0663333311676979, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 354 + }, + { + "epoch": 0.28354632587859424, + "grad_norm": 0.0628359317779541, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 355 + }, + { + "epoch": 0.28434504792332266, + "grad_norm": 0.1566074788570404, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 356 + }, + { + "epoch": 0.28514376996805113, + "grad_norm": 0.23291122913360596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 357 + }, + { + "epoch": 0.28594249201277955, + "grad_norm": 0.21403467655181885, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 358 + }, + { + "epoch": 0.28674121405750796, + "grad_norm": 0.08412498980760574, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 359 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 0.1415901631116867, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 360 + }, + { + "epoch": 0.28833865814696485, + "grad_norm": 0.29960349202156067, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 361 + }, + { + "epoch": 0.28913738019169327, + "grad_norm": 0.33849450945854187, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 362 + }, + { + "epoch": 0.28993610223642174, + "grad_norm": 0.24428068101406097, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 363 + }, + { + "epoch": 0.29073482428115016, + "grad_norm": 0.07897785305976868, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 364 + }, + { + "epoch": 0.2915335463258786, + "grad_norm": 0.1347426027059555, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 365 + }, + { + "epoch": 0.29233226837060705, + "grad_norm": 0.21387724578380585, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 366 + }, + { + "epoch": 0.29313099041533547, + "grad_norm": 0.13869348168373108, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 367 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.062060993164777756, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 368 + }, + { + "epoch": 0.29472843450479236, + "grad_norm": 0.13848915696144104, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 369 + }, + { + "epoch": 0.2955271565495208, + "grad_norm": 0.12179117649793625, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 370 + }, + { + "epoch": 0.2963258785942492, + "grad_norm": 0.13039280474185944, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 371 + }, + { + "epoch": 0.2971246006389776, + "grad_norm": 0.09119348227977753, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 372 + }, + { + "epoch": 0.2979233226837061, + "grad_norm": 0.06374438107013702, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 373 + }, + { + "epoch": 0.2987220447284345, + "grad_norm": 0.1524113267660141, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 374 + }, + { + "epoch": 0.2995207667731629, + "grad_norm": 0.18103912472724915, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 375 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 0.1439986377954483, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 376 + }, + { + "epoch": 0.3011182108626198, + "grad_norm": 0.1268371045589447, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 377 + }, + { + "epoch": 0.3019169329073482, + "grad_norm": 0.07370569556951523, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 378 + }, + { + "epoch": 0.3027156549520767, + "grad_norm": 0.0718536451458931, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 379 + }, + { + "epoch": 0.3035143769968051, + "grad_norm": 0.10444384068250656, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 380 + }, + { + "epoch": 0.30431309904153353, + "grad_norm": 0.10085552930831909, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 381 + }, + { + "epoch": 0.305111821086262, + "grad_norm": 0.08599484711885452, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 382 + }, + { + "epoch": 0.3059105431309904, + "grad_norm": 0.08912923187017441, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 383 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.17919759452342987, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 384 + }, + { + "epoch": 0.3075079872204473, + "grad_norm": 0.23954501748085022, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 385 + }, + { + "epoch": 0.3083067092651757, + "grad_norm": 0.2940942645072937, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 386 + }, + { + "epoch": 0.30910543130990414, + "grad_norm": 0.2905970513820648, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 387 + }, + { + "epoch": 0.30990415335463256, + "grad_norm": 0.2555491626262665, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 388 + }, + { + "epoch": 0.31070287539936103, + "grad_norm": 0.15303272008895874, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 389 + }, + { + "epoch": 0.31150159744408945, + "grad_norm": 0.10148895531892776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 390 + }, + { + "epoch": 0.31230031948881787, + "grad_norm": 0.21828792989253998, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 391 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 0.27219685912132263, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 392 + }, + { + "epoch": 0.31389776357827476, + "grad_norm": 0.3431699872016907, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 393 + }, + { + "epoch": 0.3146964856230032, + "grad_norm": 0.32346805930137634, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 394 + }, + { + "epoch": 0.31549520766773165, + "grad_norm": 0.17791730165481567, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 395 + }, + { + "epoch": 0.31629392971246006, + "grad_norm": 0.09576063603162766, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 396 + }, + { + "epoch": 0.3170926517571885, + "grad_norm": 0.050598498433828354, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 397 + }, + { + "epoch": 0.31789137380191695, + "grad_norm": 0.07385009527206421, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 398 + }, + { + "epoch": 0.31869009584664537, + "grad_norm": 0.08680527657270432, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 399 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.06436332315206528, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 400 + }, + { + "epoch": 0.32028753993610226, + "grad_norm": 0.05943639203906059, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 401 + }, + { + "epoch": 0.3210862619808307, + "grad_norm": 0.10015929490327835, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 402 + }, + { + "epoch": 0.3218849840255591, + "grad_norm": 0.07852698862552643, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 403 + }, + { + "epoch": 0.3226837060702875, + "grad_norm": 0.06103534996509552, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 404 + }, + { + "epoch": 0.323482428115016, + "grad_norm": 0.04573113098740578, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 405 + }, + { + "epoch": 0.3242811501597444, + "grad_norm": 0.06108849495649338, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 406 + }, + { + "epoch": 0.3250798722044728, + "grad_norm": 0.10209841281175613, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 407 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 0.0956021398305893, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 408 + }, + { + "epoch": 0.3266773162939297, + "grad_norm": 0.12572422623634338, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 409 + }, + { + "epoch": 0.3274760383386581, + "grad_norm": 0.1532585173845291, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 410 + }, + { + "epoch": 0.3282747603833866, + "grad_norm": 0.10664337128400803, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 411 + }, + { + "epoch": 0.329073482428115, + "grad_norm": 0.07705336064100266, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 412 + }, + { + "epoch": 0.32987220447284343, + "grad_norm": 0.08611477166414261, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 413 + }, + { + "epoch": 0.3306709265175719, + "grad_norm": 0.11460789293050766, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 414 + }, + { + "epoch": 0.3314696485623003, + "grad_norm": 0.1214505136013031, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 415 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.07482243329286575, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 416 + }, + { + "epoch": 0.3330670926517572, + "grad_norm": 0.05022026225924492, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 417 + }, + { + "epoch": 0.33386581469648563, + "grad_norm": 0.086161769926548, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 418 + }, + { + "epoch": 0.33466453674121405, + "grad_norm": 0.05073339864611626, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 419 + }, + { + "epoch": 0.3354632587859425, + "grad_norm": 0.0925290584564209, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 420 + }, + { + "epoch": 0.33626198083067094, + "grad_norm": 0.08073565363883972, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 421 + }, + { + "epoch": 0.33706070287539935, + "grad_norm": 0.06067343428730965, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 422 + }, + { + "epoch": 0.33785942492012777, + "grad_norm": 0.16081079840660095, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 423 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 0.3043743371963501, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 424 + }, + { + "epoch": 0.33945686900958466, + "grad_norm": 0.32498979568481445, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 425 + }, + { + "epoch": 0.3402555910543131, + "grad_norm": 0.206096351146698, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 426 + }, + { + "epoch": 0.34105431309904155, + "grad_norm": 0.11892937123775482, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 427 + }, + { + "epoch": 0.34185303514376997, + "grad_norm": 0.19896888732910156, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 428 + }, + { + "epoch": 0.3426517571884984, + "grad_norm": 0.3295411169528961, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 429 + }, + { + "epoch": 0.34345047923322686, + "grad_norm": 0.3841599225997925, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 430 + }, + { + "epoch": 0.3442492012779553, + "grad_norm": 0.36113840341567993, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 431 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.25694623589515686, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 432 + }, + { + "epoch": 0.34584664536741216, + "grad_norm": 0.07741750776767731, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 433 + }, + { + "epoch": 0.3466453674121406, + "grad_norm": 0.1385476440191269, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 434 + }, + { + "epoch": 0.347444089456869, + "grad_norm": 0.22972947359085083, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 435 + }, + { + "epoch": 0.34824281150159747, + "grad_norm": 0.15720337629318237, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 436 + }, + { + "epoch": 0.3490415335463259, + "grad_norm": 0.04451138526201248, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 437 + }, + { + "epoch": 0.3498402555910543, + "grad_norm": 0.15054486691951752, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 438 + }, + { + "epoch": 0.3506389776357827, + "grad_norm": 0.16740895807743073, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 439 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 0.1388419270515442, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 440 + }, + { + "epoch": 0.3522364217252396, + "grad_norm": 0.06480700522661209, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 441 + }, + { + "epoch": 0.35303514376996803, + "grad_norm": 0.09604794532060623, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 442 + }, + { + "epoch": 0.3538338658146965, + "grad_norm": 0.174916610121727, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 443 + }, + { + "epoch": 0.3546325878594249, + "grad_norm": 0.2228047251701355, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 444 + }, + { + "epoch": 0.35543130990415334, + "grad_norm": 0.24461773037910461, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 445 + }, + { + "epoch": 0.3562300319488818, + "grad_norm": 0.2201017141342163, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 446 + }, + { + "epoch": 0.3570287539936102, + "grad_norm": 0.11596337705850601, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 447 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.1682164967060089, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 448 + }, + { + "epoch": 0.3586261980830671, + "grad_norm": 0.4297041594982147, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 449 + }, + { + "epoch": 0.35942492012779553, + "grad_norm": 0.5659548044204712, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 450 + }, + { + "epoch": 0.36022364217252395, + "grad_norm": 0.5303114652633667, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 451 + }, + { + "epoch": 0.3610223642172524, + "grad_norm": 0.23788955807685852, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 452 + }, + { + "epoch": 0.36182108626198084, + "grad_norm": 0.15622566640377045, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 453 + }, + { + "epoch": 0.36261980830670926, + "grad_norm": 0.327275812625885, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 454 + }, + { + "epoch": 0.3634185303514377, + "grad_norm": 0.23511037230491638, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 455 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 0.11690831184387207, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 456 + }, + { + "epoch": 0.36501597444089456, + "grad_norm": 0.17950886487960815, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 457 + }, + { + "epoch": 0.365814696485623, + "grad_norm": 0.13816051185131073, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 458 + }, + { + "epoch": 0.36661341853035145, + "grad_norm": 0.09056458622217178, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 459 + }, + { + "epoch": 0.36741214057507987, + "grad_norm": 0.1648412048816681, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 460 + }, + { + "epoch": 0.3682108626198083, + "grad_norm": 0.24407249689102173, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 461 + }, + { + "epoch": 0.36900958466453676, + "grad_norm": 0.1896992176771164, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 462 + }, + { + "epoch": 0.3698083067092652, + "grad_norm": 0.07938385009765625, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 463 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.10241381078958511, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 464 + }, + { + "epoch": 0.37140575079872207, + "grad_norm": 0.14765797555446625, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 465 + }, + { + "epoch": 0.3722044728434505, + "grad_norm": 0.11189796775579453, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 466 + }, + { + "epoch": 0.3730031948881789, + "grad_norm": 0.05604114383459091, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 467 + }, + { + "epoch": 0.3738019169329074, + "grad_norm": 0.18633529543876648, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 468 + }, + { + "epoch": 0.3746006389776358, + "grad_norm": 0.2587120234966278, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 469 + }, + { + "epoch": 0.3753993610223642, + "grad_norm": 0.21629218757152557, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 470 + }, + { + "epoch": 0.3761980830670926, + "grad_norm": 0.11872006952762604, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 471 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 0.07732011377811432, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 472 + }, + { + "epoch": 0.3777955271565495, + "grad_norm": 0.20141537487506866, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 473 + }, + { + "epoch": 0.37859424920127793, + "grad_norm": 0.26726409792900085, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 474 + }, + { + "epoch": 0.3793929712460064, + "grad_norm": 0.2373354583978653, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 475 + }, + { + "epoch": 0.3801916932907348, + "grad_norm": 0.15030571818351746, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 476 + }, + { + "epoch": 0.38099041533546324, + "grad_norm": 0.05345006287097931, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 477 + }, + { + "epoch": 0.3817891373801917, + "grad_norm": 0.12551648914813995, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 478 + }, + { + "epoch": 0.38258785942492013, + "grad_norm": 0.14036186039447784, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 479 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.09807970374822617, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 480 + }, + { + "epoch": 0.384185303514377, + "grad_norm": 0.05071088671684265, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 481 + }, + { + "epoch": 0.38498402555910544, + "grad_norm": 0.07541649043560028, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 482 + }, + { + "epoch": 0.38578274760383385, + "grad_norm": 0.059762127697467804, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 483 + }, + { + "epoch": 0.3865814696485623, + "grad_norm": 0.05540496110916138, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 484 + }, + { + "epoch": 0.38738019169329074, + "grad_norm": 0.09137953072786331, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 485 + }, + { + "epoch": 0.38817891373801916, + "grad_norm": 0.1349237710237503, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 486 + }, + { + "epoch": 0.3889776357827476, + "grad_norm": 0.13889296352863312, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 487 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 0.16406965255737305, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 488 + }, + { + "epoch": 0.39057507987220447, + "grad_norm": 0.1748959869146347, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 489 + }, + { + "epoch": 0.3913738019169329, + "grad_norm": 0.1518068015575409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 490 + }, + { + "epoch": 0.39217252396166136, + "grad_norm": 0.06694433838129044, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 491 + }, + { + "epoch": 0.3929712460063898, + "grad_norm": 0.11556574702262878, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 492 + }, + { + "epoch": 0.3937699680511182, + "grad_norm": 0.2562897801399231, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 493 + }, + { + "epoch": 0.39456869009584666, + "grad_norm": 0.30842337012290955, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 494 + }, + { + "epoch": 0.3953674121405751, + "grad_norm": 0.30477815866470337, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 495 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.2602941691875458, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 496 + }, + { + "epoch": 0.39696485623003197, + "grad_norm": 0.1692838817834854, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 497 + }, + { + "epoch": 0.3977635782747604, + "grad_norm": 0.07468903064727783, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 498 + }, + { + "epoch": 0.3985623003194888, + "grad_norm": 0.05872616916894913, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 499 + }, + { + "epoch": 0.3993610223642173, + "grad_norm": 0.09878433495759964, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 500 + }, + { + "epoch": 0.4001597444089457, + "grad_norm": 0.13779069483280182, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 501 + }, + { + "epoch": 0.4009584664536741, + "grad_norm": 0.17778213322162628, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 502 + }, + { + "epoch": 0.40175718849840253, + "grad_norm": 0.15572750568389893, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 503 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 0.1154002770781517, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 504 + }, + { + "epoch": 0.4033546325878594, + "grad_norm": 0.04485362395644188, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 505 + }, + { + "epoch": 0.40415335463258784, + "grad_norm": 0.07514321058988571, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 506 + }, + { + "epoch": 0.4049520766773163, + "grad_norm": 0.13954220712184906, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 507 + }, + { + "epoch": 0.4057507987220447, + "grad_norm": 0.20726922154426575, + "learning_rate": 0.0005, + "loss": 1.0363, + "step": 508 + }, + { + "epoch": 0.40654952076677314, + "grad_norm": 0.28239160776138306, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 509 + }, + { + "epoch": 0.4073482428115016, + "grad_norm": 0.28484129905700684, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 510 + }, + { + "epoch": 0.40814696485623003, + "grad_norm": 0.28111377358436584, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 511 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.25087496638298035, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 512 + }, + { + "epoch": 0.4097444089456869, + "grad_norm": 0.1652008444070816, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 513 + }, + { + "epoch": 0.41054313099041534, + "grad_norm": 0.11345700174570084, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 514 + }, + { + "epoch": 0.41134185303514376, + "grad_norm": 0.1191159337759018, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 515 + }, + { + "epoch": 0.41214057507987223, + "grad_norm": 0.26302817463874817, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 516 + }, + { + "epoch": 0.41293929712460065, + "grad_norm": 0.3303217589855194, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 517 + }, + { + "epoch": 0.41373801916932906, + "grad_norm": 0.2874647378921509, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 518 + }, + { + "epoch": 0.4145367412140575, + "grad_norm": 0.23112182319164276, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 519 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 0.16285021603107452, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 520 + }, + { + "epoch": 0.41613418530351437, + "grad_norm": 0.08440099656581879, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 521 + }, + { + "epoch": 0.4169329073482428, + "grad_norm": 0.03578028455376625, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 522 + }, + { + "epoch": 0.41773162939297126, + "grad_norm": 0.0995275005698204, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 523 + }, + { + "epoch": 0.4185303514376997, + "grad_norm": 0.17713160812854767, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 524 + }, + { + "epoch": 0.4193290734824281, + "grad_norm": 0.1685509830713272, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 525 + }, + { + "epoch": 0.42012779552715657, + "grad_norm": 0.11357919126749039, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 526 + }, + { + "epoch": 0.420926517571885, + "grad_norm": 0.059025365859270096, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 527 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.05128806456923485, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 528 + }, + { + "epoch": 0.4225239616613419, + "grad_norm": 0.05291247367858887, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 529 + }, + { + "epoch": 0.4233226837060703, + "grad_norm": 0.10755500197410583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 530 + }, + { + "epoch": 0.4241214057507987, + "grad_norm": 0.15659615397453308, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 531 + }, + { + "epoch": 0.4249201277955272, + "grad_norm": 0.19369953870773315, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 532 + }, + { + "epoch": 0.4257188498402556, + "grad_norm": 0.16491396725177765, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 533 + }, + { + "epoch": 0.426517571884984, + "grad_norm": 0.10276799649000168, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 534 + }, + { + "epoch": 0.4273162939297125, + "grad_norm": 0.06273368000984192, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 535 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 0.03896406292915344, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 536 + }, + { + "epoch": 0.4289137380191693, + "grad_norm": 0.08083273470401764, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 537 + }, + { + "epoch": 0.42971246006389774, + "grad_norm": 0.05107828602194786, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 538 + }, + { + "epoch": 0.4305111821086262, + "grad_norm": 0.04359392821788788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 539 + }, + { + "epoch": 0.43130990415335463, + "grad_norm": 0.04225402697920799, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 540 + }, + { + "epoch": 0.43210862619808305, + "grad_norm": 0.07523404061794281, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 541 + }, + { + "epoch": 0.4329073482428115, + "grad_norm": 0.07966417819261551, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 542 + }, + { + "epoch": 0.43370607028753994, + "grad_norm": 0.04529299959540367, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 543 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.0793156549334526, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 544 + }, + { + "epoch": 0.4353035143769968, + "grad_norm": 0.1533992737531662, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 545 + }, + { + "epoch": 0.43610223642172524, + "grad_norm": 0.2893797755241394, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 546 + }, + { + "epoch": 0.43690095846645366, + "grad_norm": 0.4145842492580414, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 547 + }, + { + "epoch": 0.43769968051118213, + "grad_norm": 0.4550987482070923, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 548 + }, + { + "epoch": 0.43849840255591055, + "grad_norm": 0.4318651556968689, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 549 + }, + { + "epoch": 0.43929712460063897, + "grad_norm": 0.35961681604385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 550 + }, + { + "epoch": 0.44009584664536744, + "grad_norm": 0.18606753647327423, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 551 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 0.12992478907108307, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 552 + }, + { + "epoch": 0.4416932907348243, + "grad_norm": 0.32936930656433105, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 553 + }, + { + "epoch": 0.4424920127795527, + "grad_norm": 0.3547491133213043, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 554 + }, + { + "epoch": 0.44329073482428116, + "grad_norm": 0.2144627720117569, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 555 + }, + { + "epoch": 0.4440894568690096, + "grad_norm": 0.07260395586490631, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 556 + }, + { + "epoch": 0.444888178913738, + "grad_norm": 0.19895662367343903, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 557 + }, + { + "epoch": 0.44568690095846647, + "grad_norm": 0.18664990365505219, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 558 + }, + { + "epoch": 0.4464856230031949, + "grad_norm": 0.11666610836982727, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 559 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.11163592338562012, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 560 + }, + { + "epoch": 0.4480830670926518, + "grad_norm": 0.1815878301858902, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 561 + }, + { + "epoch": 0.4488817891373802, + "grad_norm": 0.2593924105167389, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 562 + }, + { + "epoch": 0.4496805111821086, + "grad_norm": 0.20761220157146454, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 563 + }, + { + "epoch": 0.4504792332268371, + "grad_norm": 0.06589766591787338, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 564 + }, + { + "epoch": 0.4512779552715655, + "grad_norm": 0.21619920432567596, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 565 + }, + { + "epoch": 0.4520766773162939, + "grad_norm": 0.2392708659172058, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 566 + }, + { + "epoch": 0.4528753993610224, + "grad_norm": 0.23214633762836456, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 567 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 0.263883501291275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 568 + }, + { + "epoch": 0.4544728434504792, + "grad_norm": 0.19914190471172333, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 569 + }, + { + "epoch": 0.45527156549520764, + "grad_norm": 0.11453433334827423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 570 + }, + { + "epoch": 0.4560702875399361, + "grad_norm": 0.15091221034526825, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 571 + }, + { + "epoch": 0.45686900958466453, + "grad_norm": 0.043582383543252945, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 572 + }, + { + "epoch": 0.45766773162939295, + "grad_norm": 0.14068740606307983, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 573 + }, + { + "epoch": 0.4584664536741214, + "grad_norm": 0.1274290233850479, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 574 + }, + { + "epoch": 0.45926517571884984, + "grad_norm": 0.13504599034786224, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 575 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.1267779916524887, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 576 + }, + { + "epoch": 0.46086261980830673, + "grad_norm": 0.08138085901737213, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 577 + }, + { + "epoch": 0.46166134185303515, + "grad_norm": 0.07772356271743774, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 578 + }, + { + "epoch": 0.46246006389776356, + "grad_norm": 0.06863631308078766, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 579 + }, + { + "epoch": 0.46325878594249204, + "grad_norm": 0.1232575923204422, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 580 + }, + { + "epoch": 0.46405750798722045, + "grad_norm": 0.179134801030159, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 581 + }, + { + "epoch": 0.46485623003194887, + "grad_norm": 0.20545582473278046, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 582 + }, + { + "epoch": 0.46565495207667734, + "grad_norm": 0.14182575047016144, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 583 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 0.05813328176736832, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 584 + }, + { + "epoch": 0.4672523961661342, + "grad_norm": 0.1530984789133072, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 585 + }, + { + "epoch": 0.4680511182108626, + "grad_norm": 0.2820036709308624, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 586 + }, + { + "epoch": 0.46884984025559107, + "grad_norm": 0.39252954721450806, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 587 + }, + { + "epoch": 0.4696485623003195, + "grad_norm": 0.40830549597740173, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 588 + }, + { + "epoch": 0.4704472843450479, + "grad_norm": 0.2846182882785797, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 589 + }, + { + "epoch": 0.4712460063897764, + "grad_norm": 0.06798163801431656, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 590 + }, + { + "epoch": 0.4720447284345048, + "grad_norm": 0.18650950491428375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 591 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.2965260446071625, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 592 + }, + { + "epoch": 0.4736421725239617, + "grad_norm": 0.24504852294921875, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 593 + }, + { + "epoch": 0.4744408945686901, + "grad_norm": 0.11336984485387802, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 594 + }, + { + "epoch": 0.4752396166134185, + "grad_norm": 0.09007567912340164, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 595 + }, + { + "epoch": 0.476038338658147, + "grad_norm": 0.225834459066391, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 596 + }, + { + "epoch": 0.4768370607028754, + "grad_norm": 0.2679842710494995, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 597 + }, + { + "epoch": 0.4776357827476038, + "grad_norm": 0.1801901012659073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 598 + }, + { + "epoch": 0.4784345047923323, + "grad_norm": 0.09554167836904526, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 599 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 0.046632468700408936, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 600 + }, + { + "epoch": 0.48003194888178913, + "grad_norm": 0.12078758329153061, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 601 + }, + { + "epoch": 0.48083067092651754, + "grad_norm": 0.12126865237951279, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 602 + }, + { + "epoch": 0.481629392971246, + "grad_norm": 0.14078640937805176, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 603 + }, + { + "epoch": 0.48242811501597443, + "grad_norm": 0.18556037545204163, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 604 + }, + { + "epoch": 0.48322683706070285, + "grad_norm": 0.178151473402977, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 605 + }, + { + "epoch": 0.4840255591054313, + "grad_norm": 0.1672516018152237, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 606 + }, + { + "epoch": 0.48482428115015974, + "grad_norm": 0.11648737639188766, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 607 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.11820051819086075, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 608 + }, + { + "epoch": 0.48642172523961663, + "grad_norm": 0.21110932528972626, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 609 + }, + { + "epoch": 0.48722044728434505, + "grad_norm": 0.24852754175662994, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 610 + }, + { + "epoch": 0.48801916932907347, + "grad_norm": 0.2633175551891327, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 611 + }, + { + "epoch": 0.48881789137380194, + "grad_norm": 0.21904303133487701, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 612 + }, + { + "epoch": 0.48961661341853036, + "grad_norm": 0.07822466641664505, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 613 + }, + { + "epoch": 0.4904153354632588, + "grad_norm": 0.0767827108502388, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 614 + }, + { + "epoch": 0.49121405750798725, + "grad_norm": 0.07943699508905411, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 615 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 0.055741772055625916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 616 + }, + { + "epoch": 0.4928115015974441, + "grad_norm": 0.10400068014860153, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 617 + }, + { + "epoch": 0.4936102236421725, + "grad_norm": 0.05080602690577507, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 618 + }, + { + "epoch": 0.49440894568690097, + "grad_norm": 0.07927533984184265, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 619 + }, + { + "epoch": 0.4952076677316294, + "grad_norm": 0.07919944822788239, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 620 + }, + { + "epoch": 0.4960063897763578, + "grad_norm": 0.11013699322938919, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 621 + }, + { + "epoch": 0.4968051118210863, + "grad_norm": 0.16232389211654663, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 622 + }, + { + "epoch": 0.4976038338658147, + "grad_norm": 0.17625346779823303, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 623 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.1681327521800995, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 624 + }, + { + "epoch": 0.4992012779552716, + "grad_norm": 0.1882159262895584, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 625 + }, + { + "epoch": 0.5, + "grad_norm": 0.21075129508972168, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 626 + }, + { + "epoch": 0.5007987220447284, + "grad_norm": 0.1464296281337738, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 627 + }, + { + "epoch": 0.5015974440894568, + "grad_norm": 0.11155212670564651, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 628 + }, + { + "epoch": 0.5023961661341853, + "grad_norm": 0.09794416278600693, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 629 + }, + { + "epoch": 0.5031948881789138, + "grad_norm": 0.12095183879137039, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 630 + }, + { + "epoch": 0.5039936102236422, + "grad_norm": 0.1933794617652893, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 631 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 0.32272887229919434, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 632 + }, + { + "epoch": 0.505591054313099, + "grad_norm": 0.2507671117782593, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 633 + }, + { + "epoch": 0.5063897763578274, + "grad_norm": 0.09540661424398422, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 634 + }, + { + "epoch": 0.5071884984025559, + "grad_norm": 0.07341819256544113, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 635 + }, + { + "epoch": 0.5079872204472844, + "grad_norm": 0.11610874533653259, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 636 + }, + { + "epoch": 0.5087859424920128, + "grad_norm": 0.1338607519865036, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 637 + }, + { + "epoch": 0.5095846645367412, + "grad_norm": 0.07892445474863052, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 638 + }, + { + "epoch": 0.5103833865814696, + "grad_norm": 0.053661834448575974, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 639 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.06852453202009201, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 640 + }, + { + "epoch": 0.5119808306709265, + "grad_norm": 0.045109208673238754, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 641 + }, + { + "epoch": 0.512779552715655, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 642 + }, + { + "epoch": 0.5135782747603834, + "grad_norm": 0.05903350189328194, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 643 + }, + { + "epoch": 0.5143769968051118, + "grad_norm": 0.07314767688512802, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 644 + }, + { + "epoch": 0.5151757188498403, + "grad_norm": 0.12484236806631088, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 645 + }, + { + "epoch": 0.5159744408945687, + "grad_norm": 0.15683352947235107, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 646 + }, + { + "epoch": 0.5167731629392971, + "grad_norm": 0.13519413769245148, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 647 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 0.10333485156297684, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 648 + }, + { + "epoch": 0.518370607028754, + "grad_norm": 0.09626923501491547, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 649 + }, + { + "epoch": 0.5191693290734825, + "grad_norm": 0.08177447319030762, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 650 + }, + { + "epoch": 0.5199680511182109, + "grad_norm": 0.04186684265732765, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 651 + }, + { + "epoch": 0.5207667731629393, + "grad_norm": 0.07705547660589218, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 652 + }, + { + "epoch": 0.5215654952076677, + "grad_norm": 0.05885700136423111, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 653 + }, + { + "epoch": 0.5223642172523961, + "grad_norm": 0.14140211045742035, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 654 + }, + { + "epoch": 0.5231629392971247, + "grad_norm": 0.18797138333320618, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 655 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.2301982045173645, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 656 + }, + { + "epoch": 0.5247603833865815, + "grad_norm": 0.2813114523887634, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 657 + }, + { + "epoch": 0.5255591054313099, + "grad_norm": 0.3205592930316925, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 658 + }, + { + "epoch": 0.5263578274760383, + "grad_norm": 0.3426150381565094, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 659 + }, + { + "epoch": 0.5271565495207667, + "grad_norm": 0.2636663615703583, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 660 + }, + { + "epoch": 0.5279552715654952, + "grad_norm": 0.14799079298973083, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 661 + }, + { + "epoch": 0.5287539936102237, + "grad_norm": 0.06354992836713791, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 662 + }, + { + "epoch": 0.5295527156549521, + "grad_norm": 0.239300936460495, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 663 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 0.33535388112068176, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 664 + }, + { + "epoch": 0.5311501597444089, + "grad_norm": 0.32471078634262085, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 665 + }, + { + "epoch": 0.5319488817891374, + "grad_norm": 0.2491266429424286, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 666 + }, + { + "epoch": 0.5327476038338658, + "grad_norm": 0.09841614216566086, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 667 + }, + { + "epoch": 0.5335463258785943, + "grad_norm": 0.1310579627752304, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 668 + }, + { + "epoch": 0.5343450479233227, + "grad_norm": 0.28287971019744873, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 669 + }, + { + "epoch": 0.5351437699680511, + "grad_norm": 0.3457719385623932, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 670 + }, + { + "epoch": 0.5359424920127795, + "grad_norm": 0.31690946221351624, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 671 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.19356760382652283, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 672 + }, + { + "epoch": 0.5375399361022364, + "grad_norm": 0.05940595269203186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 673 + }, + { + "epoch": 0.5383386581469649, + "grad_norm": 0.20772181451320648, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 674 + }, + { + "epoch": 0.5391373801916933, + "grad_norm": 0.3093980848789215, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 675 + }, + { + "epoch": 0.5399361022364217, + "grad_norm": 0.2632107734680176, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 676 + }, + { + "epoch": 0.5407348242811502, + "grad_norm": 0.12365782260894775, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 677 + }, + { + "epoch": 0.5415335463258786, + "grad_norm": 0.07215466350317001, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 678 + }, + { + "epoch": 0.542332268370607, + "grad_norm": 0.16745947301387787, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 679 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 0.14418186247348785, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 680 + }, + { + "epoch": 0.5439297124600639, + "grad_norm": 0.048094023019075394, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 681 + }, + { + "epoch": 0.5447284345047924, + "grad_norm": 0.10100048035383224, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 682 + }, + { + "epoch": 0.5455271565495208, + "grad_norm": 0.13719545304775238, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 683 + }, + { + "epoch": 0.5463258785942492, + "grad_norm": 0.16066808998584747, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 684 + }, + { + "epoch": 0.5471246006389776, + "grad_norm": 0.19201414287090302, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 685 + }, + { + "epoch": 0.547923322683706, + "grad_norm": 0.19783100485801697, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 686 + }, + { + "epoch": 0.5487220447284346, + "grad_norm": 0.1431797295808792, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 687 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.04368956387042999, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 688 + }, + { + "epoch": 0.5503194888178914, + "grad_norm": 0.12395253777503967, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 689 + }, + { + "epoch": 0.5511182108626198, + "grad_norm": 0.16278770565986633, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 690 + }, + { + "epoch": 0.5519169329073482, + "grad_norm": 0.15368889272212982, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 691 + }, + { + "epoch": 0.5527156549520766, + "grad_norm": 0.10195931792259216, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 692 + }, + { + "epoch": 0.5535143769968051, + "grad_norm": 0.03421236202120781, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 693 + }, + { + "epoch": 0.5543130990415336, + "grad_norm": 0.09549148380756378, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 694 + }, + { + "epoch": 0.555111821086262, + "grad_norm": 0.17825989425182343, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 695 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 0.25296247005462646, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 696 + }, + { + "epoch": 0.5567092651757188, + "grad_norm": 0.27566400170326233, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 697 + }, + { + "epoch": 0.5575079872204473, + "grad_norm": 0.22609780728816986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 698 + }, + { + "epoch": 0.5583067092651757, + "grad_norm": 0.10555832833051682, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 699 + }, + { + "epoch": 0.5591054313099042, + "grad_norm": 0.1309640258550644, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 700 + }, + { + "epoch": 0.5599041533546326, + "grad_norm": 0.3434476852416992, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 701 + }, + { + "epoch": 0.560702875399361, + "grad_norm": 0.4559882581233978, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 702 + }, + { + "epoch": 0.5615015974440895, + "grad_norm": 0.390683650970459, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 703 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.14178164303302765, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 704 + }, + { + "epoch": 0.5630990415335463, + "grad_norm": 0.19113974273204803, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 705 + }, + { + "epoch": 0.5638977635782748, + "grad_norm": 0.38376086950302124, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 706 + }, + { + "epoch": 0.5646964856230032, + "grad_norm": 0.3486707806587219, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 707 + }, + { + "epoch": 0.5654952076677316, + "grad_norm": 0.14712302386760712, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 708 + }, + { + "epoch": 0.5662939297124601, + "grad_norm": 0.11827494204044342, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 709 + }, + { + "epoch": 0.5670926517571885, + "grad_norm": 0.27573689818382263, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 710 + }, + { + "epoch": 0.5678913738019169, + "grad_norm": 0.2983379065990448, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 711 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 0.2019582986831665, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 712 + }, + { + "epoch": 0.5694888178913738, + "grad_norm": 0.04186725243926048, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 713 + }, + { + "epoch": 0.5702875399361023, + "grad_norm": 0.16714231669902802, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 714 + }, + { + "epoch": 0.5710862619808307, + "grad_norm": 0.24982011318206787, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 715 + }, + { + "epoch": 0.5718849840255591, + "grad_norm": 0.22021397948265076, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 716 + }, + { + "epoch": 0.5726837060702875, + "grad_norm": 0.09717470407485962, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 717 + }, + { + "epoch": 0.5734824281150159, + "grad_norm": 0.10214962065219879, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 718 + }, + { + "epoch": 0.5742811501597445, + "grad_norm": 0.15325960516929626, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 719 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.11207877099514008, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 720 + }, + { + "epoch": 0.5758785942492013, + "grad_norm": 0.05425047129392624, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 721 + }, + { + "epoch": 0.5766773162939297, + "grad_norm": 0.0703732892870903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 722 + }, + { + "epoch": 0.5774760383386581, + "grad_norm": 0.10577918589115143, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 723 + }, + { + "epoch": 0.5782747603833865, + "grad_norm": 0.13230514526367188, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 724 + }, + { + "epoch": 0.579073482428115, + "grad_norm": 0.1878778040409088, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 725 + }, + { + "epoch": 0.5798722044728435, + "grad_norm": 0.19956567883491516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 726 + }, + { + "epoch": 0.5806709265175719, + "grad_norm": 0.13732020556926727, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 727 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 0.09844338148832321, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 728 + }, + { + "epoch": 0.5822683706070287, + "grad_norm": 0.056577637791633606, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 729 + }, + { + "epoch": 0.5830670926517572, + "grad_norm": 0.0835585743188858, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 730 + }, + { + "epoch": 0.5838658146964856, + "grad_norm": 0.0910082757472992, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 731 + }, + { + "epoch": 0.5846645367412141, + "grad_norm": 0.0659257099032402, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 732 + }, + { + "epoch": 0.5854632587859425, + "grad_norm": 0.09342535585165024, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 733 + }, + { + "epoch": 0.5862619808306709, + "grad_norm": 0.0627603679895401, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 734 + }, + { + "epoch": 0.5870607028753994, + "grad_norm": 0.10535050183534622, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 735 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.13628117740154266, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 736 + }, + { + "epoch": 0.5886581469648562, + "grad_norm": 0.0715300589799881, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 737 + }, + { + "epoch": 0.5894568690095847, + "grad_norm": 0.10892884433269501, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 738 + }, + { + "epoch": 0.5902555910543131, + "grad_norm": 0.09805259853601456, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 739 + }, + { + "epoch": 0.5910543130990416, + "grad_norm": 0.14491751790046692, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 740 + }, + { + "epoch": 0.59185303514377, + "grad_norm": 0.15448585152626038, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 741 + }, + { + "epoch": 0.5926517571884984, + "grad_norm": 0.08218494802713394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 742 + }, + { + "epoch": 0.5934504792332268, + "grad_norm": 0.16311237215995789, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 743 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 0.10310494899749756, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 744 + }, + { + "epoch": 0.5950479233226837, + "grad_norm": 0.1511978805065155, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 745 + }, + { + "epoch": 0.5958466453674122, + "grad_norm": 0.20440778136253357, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 746 + }, + { + "epoch": 0.5966453674121406, + "grad_norm": 0.20918506383895874, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 747 + }, + { + "epoch": 0.597444089456869, + "grad_norm": 0.20070627331733704, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 748 + }, + { + "epoch": 0.5982428115015974, + "grad_norm": 0.1142180860042572, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 749 + }, + { + "epoch": 0.5990415335463258, + "grad_norm": 0.09418357163667679, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 750 + }, + { + "epoch": 0.5998402555910544, + "grad_norm": 0.24306562542915344, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 751 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.3208121955394745, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 752 + }, + { + "epoch": 0.6014376996805112, + "grad_norm": 0.3070276081562042, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 753 + }, + { + "epoch": 0.6022364217252396, + "grad_norm": 0.17130877077579498, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 754 + }, + { + "epoch": 0.603035143769968, + "grad_norm": 0.0733935534954071, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 755 + }, + { + "epoch": 0.6038338658146964, + "grad_norm": 0.25525134801864624, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 756 + }, + { + "epoch": 0.604632587859425, + "grad_norm": 0.39397957921028137, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 757 + }, + { + "epoch": 0.6054313099041534, + "grad_norm": 0.39015471935272217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 758 + }, + { + "epoch": 0.6062300319488818, + "grad_norm": 0.1757609099149704, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 759 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 0.19901637732982635, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 760 + }, + { + "epoch": 0.6078274760383386, + "grad_norm": 0.46885979175567627, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 761 + }, + { + "epoch": 0.6086261980830671, + "grad_norm": 0.4650067687034607, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 762 + }, + { + "epoch": 0.6094249201277955, + "grad_norm": 0.16624194383621216, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 763 + }, + { + "epoch": 0.610223642172524, + "grad_norm": 0.23347698152065277, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 764 + }, + { + "epoch": 0.6110223642172524, + "grad_norm": 0.40192991495132446, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 765 + }, + { + "epoch": 0.6118210862619808, + "grad_norm": 0.33640867471694946, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 766 + }, + { + "epoch": 0.6126198083067093, + "grad_norm": 0.11979667842388153, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 767 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.17994286119937897, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 768 + }, + { + "epoch": 0.6142172523961661, + "grad_norm": 0.2693847715854645, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 769 + }, + { + "epoch": 0.6150159744408946, + "grad_norm": 0.2041584849357605, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 770 + }, + { + "epoch": 0.615814696485623, + "grad_norm": 0.052040908485651016, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 771 + }, + { + "epoch": 0.6166134185303515, + "grad_norm": 0.18652868270874023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 772 + }, + { + "epoch": 0.6174121405750799, + "grad_norm": 0.26122182607650757, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 773 + }, + { + "epoch": 0.6182108626198083, + "grad_norm": 0.15385891497135162, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 774 + }, + { + "epoch": 0.6190095846645367, + "grad_norm": 0.09217085689306259, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 775 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 0.23316404223442078, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 776 + }, + { + "epoch": 0.6206070287539937, + "grad_norm": 0.24094274640083313, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 777 + }, + { + "epoch": 0.6214057507987221, + "grad_norm": 0.08518059551715851, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 778 + }, + { + "epoch": 0.6222044728434505, + "grad_norm": 0.11076594144105911, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 779 + }, + { + "epoch": 0.6230031948881789, + "grad_norm": 0.1963978409767151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 780 + }, + { + "epoch": 0.6238019169329073, + "grad_norm": 0.1526973396539688, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 781 + }, + { + "epoch": 0.6246006389776357, + "grad_norm": 0.09434971958398819, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 782 + }, + { + "epoch": 0.6253993610223643, + "grad_norm": 0.2677021622657776, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 783 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.2885434329509735, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 784 + }, + { + "epoch": 0.6269968051118211, + "grad_norm": 0.14111816883087158, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 785 + }, + { + "epoch": 0.6277955271565495, + "grad_norm": 0.06594719737768173, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 786 + }, + { + "epoch": 0.6285942492012779, + "grad_norm": 0.09837283194065094, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 787 + }, + { + "epoch": 0.6293929712460063, + "grad_norm": 0.06089933589100838, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 788 + }, + { + "epoch": 0.6301916932907349, + "grad_norm": 0.16248181462287903, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 789 + }, + { + "epoch": 0.6309904153354633, + "grad_norm": 0.298454612493515, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 790 + }, + { + "epoch": 0.6317891373801917, + "grad_norm": 0.3365437090396881, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 791 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 0.22858452796936035, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 792 + }, + { + "epoch": 0.6333865814696485, + "grad_norm": 0.04849984869360924, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 793 + }, + { + "epoch": 0.634185303514377, + "grad_norm": 0.24791331589221954, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 794 + }, + { + "epoch": 0.6349840255591054, + "grad_norm": 0.3028055727481842, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 795 + }, + { + "epoch": 0.6357827476038339, + "grad_norm": 0.15674540400505066, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 796 + }, + { + "epoch": 0.6365814696485623, + "grad_norm": 0.08521793782711029, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 797 + }, + { + "epoch": 0.6373801916932907, + "grad_norm": 0.21750952303409576, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 798 + }, + { + "epoch": 0.6381789137380192, + "grad_norm": 0.18880338966846466, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 799 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.06699419766664505, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 800 + }, + { + "epoch": 0.639776357827476, + "grad_norm": 0.08062998205423355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 801 + }, + { + "epoch": 0.6405750798722045, + "grad_norm": 0.10635658353567123, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 802 + }, + { + "epoch": 0.6413738019169329, + "grad_norm": 0.05086763948202133, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 803 + }, + { + "epoch": 0.6421725239616614, + "grad_norm": 0.09852107614278793, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 804 + }, + { + "epoch": 0.6429712460063898, + "grad_norm": 0.11290771514177322, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 805 + }, + { + "epoch": 0.6437699680511182, + "grad_norm": 0.15106825530529022, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 806 + }, + { + "epoch": 0.6445686900958466, + "grad_norm": 0.13646326959133148, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 807 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 0.06398668140172958, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 808 + }, + { + "epoch": 0.6461661341853036, + "grad_norm": 0.11581127345561981, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 809 + }, + { + "epoch": 0.646964856230032, + "grad_norm": 0.15684139728546143, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 810 + }, + { + "epoch": 0.6477635782747604, + "grad_norm": 0.14094121754169464, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 811 + }, + { + "epoch": 0.6485623003194888, + "grad_norm": 0.0938766822218895, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 812 + }, + { + "epoch": 0.6493610223642172, + "grad_norm": 0.06041521951556206, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 813 + }, + { + "epoch": 0.6501597444089456, + "grad_norm": 0.13364291191101074, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 814 + }, + { + "epoch": 0.6509584664536742, + "grad_norm": 0.15577054023742676, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 815 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.1119854673743248, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 816 + }, + { + "epoch": 0.652555910543131, + "grad_norm": 0.07751357555389404, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 817 + }, + { + "epoch": 0.6533546325878594, + "grad_norm": 0.10110143572092056, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 818 + }, + { + "epoch": 0.6541533546325878, + "grad_norm": 0.19627511501312256, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 819 + }, + { + "epoch": 0.6549520766773163, + "grad_norm": 0.19837769865989685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 820 + }, + { + "epoch": 0.6557507987220448, + "grad_norm": 0.13598690927028656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 821 + }, + { + "epoch": 0.6565495207667732, + "grad_norm": 0.05950666591525078, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 822 + }, + { + "epoch": 0.6573482428115016, + "grad_norm": 0.060314662754535675, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 823 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 0.11455138027667999, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 824 + }, + { + "epoch": 0.6589456869009584, + "grad_norm": 0.16753345727920532, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 825 + }, + { + "epoch": 0.6597444089456869, + "grad_norm": 0.15707428753376007, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 826 + }, + { + "epoch": 0.6605431309904153, + "grad_norm": 0.07224153727293015, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 827 + }, + { + "epoch": 0.6613418530351438, + "grad_norm": 0.10538042336702347, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 828 + }, + { + "epoch": 0.6621405750798722, + "grad_norm": 0.18855130672454834, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 829 + }, + { + "epoch": 0.6629392971246006, + "grad_norm": 0.17752179503440857, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 830 + }, + { + "epoch": 0.6637380191693291, + "grad_norm": 0.10109171271324158, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 831 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.15006190538406372, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 832 + }, + { + "epoch": 0.6653354632587859, + "grad_norm": 0.2701014578342438, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 833 + }, + { + "epoch": 0.6661341853035144, + "grad_norm": 0.2607312500476837, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 834 + }, + { + "epoch": 0.6669329073482428, + "grad_norm": 0.19712841510772705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 835 + }, + { + "epoch": 0.6677316293929713, + "grad_norm": 0.0839366614818573, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 836 + }, + { + "epoch": 0.6685303514376997, + "grad_norm": 0.1595088541507721, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 837 + }, + { + "epoch": 0.6693290734824281, + "grad_norm": 0.2773466408252716, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 838 + }, + { + "epoch": 0.6701277955271565, + "grad_norm": 0.24616314470767975, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 839 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 0.15596427023410797, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 840 + }, + { + "epoch": 0.6717252396166135, + "grad_norm": 0.047822993248701096, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 841 + }, + { + "epoch": 0.6725239616613419, + "grad_norm": 0.17692670226097107, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 842 + }, + { + "epoch": 0.6733226837060703, + "grad_norm": 0.1742856502532959, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 843 + }, + { + "epoch": 0.6741214057507987, + "grad_norm": 0.15347127616405487, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 844 + }, + { + "epoch": 0.6749201277955271, + "grad_norm": 0.18238374590873718, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 845 + }, + { + "epoch": 0.6757188498402555, + "grad_norm": 0.1524323672056198, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 846 + }, + { + "epoch": 0.6765175718849841, + "grad_norm": 0.1820068210363388, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 847 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.2010941058397293, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 848 + }, + { + "epoch": 0.6781150159744409, + "grad_norm": 0.16428111493587494, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 849 + }, + { + "epoch": 0.6789137380191693, + "grad_norm": 0.1538572460412979, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 850 + }, + { + "epoch": 0.6797124600638977, + "grad_norm": 0.057427916675806046, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 851 + }, + { + "epoch": 0.6805111821086262, + "grad_norm": 0.08329081535339355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 852 + }, + { + "epoch": 0.6813099041533547, + "grad_norm": 0.05685174837708473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 853 + }, + { + "epoch": 0.6821086261980831, + "grad_norm": 0.15277032554149628, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 854 + }, + { + "epoch": 0.6829073482428115, + "grad_norm": 0.24243640899658203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 855 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 0.28722453117370605, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 856 + }, + { + "epoch": 0.6845047923322684, + "grad_norm": 0.1997309774160385, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 857 + }, + { + "epoch": 0.6853035143769968, + "grad_norm": 0.061719026416540146, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 858 + }, + { + "epoch": 0.6861022364217252, + "grad_norm": 0.23425672948360443, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 859 + }, + { + "epoch": 0.6869009584664537, + "grad_norm": 0.350109726190567, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 860 + }, + { + "epoch": 0.6876996805111821, + "grad_norm": 0.34444838762283325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 861 + }, + { + "epoch": 0.6884984025559105, + "grad_norm": 0.15325413644313812, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 862 + }, + { + "epoch": 0.689297124600639, + "grad_norm": 0.1227702870965004, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 863 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 0.24337291717529297, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 864 + }, + { + "epoch": 0.6908945686900958, + "grad_norm": 0.24047589302062988, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 865 + }, + { + "epoch": 0.6916932907348243, + "grad_norm": 0.13576050102710724, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 866 + }, + { + "epoch": 0.6924920127795527, + "grad_norm": 0.0503714494407177, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 867 + }, + { + "epoch": 0.6932907348242812, + "grad_norm": 0.1292860060930252, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 868 + }, + { + "epoch": 0.6940894568690096, + "grad_norm": 0.14698486030101776, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 869 + }, + { + "epoch": 0.694888178913738, + "grad_norm": 0.07720573991537094, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 870 + }, + { + "epoch": 0.6956869009584664, + "grad_norm": 0.1604471504688263, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 871 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 0.32734861969947815, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 872 + }, + { + "epoch": 0.6972843450479234, + "grad_norm": 0.32366684079170227, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 873 + }, + { + "epoch": 0.6980830670926518, + "grad_norm": 0.18428802490234375, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 874 + }, + { + "epoch": 0.6988817891373802, + "grad_norm": 0.07498858869075775, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 875 + }, + { + "epoch": 0.6996805111821086, + "grad_norm": 0.24449816346168518, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 876 + }, + { + "epoch": 0.700479233226837, + "grad_norm": 0.26649829745292664, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 877 + }, + { + "epoch": 0.7012779552715654, + "grad_norm": 0.1315024197101593, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 878 + }, + { + "epoch": 0.702076677316294, + "grad_norm": 0.10907325148582458, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 879 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.2364589273929596, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 880 + }, + { + "epoch": 0.7036741214057508, + "grad_norm": 0.1663885861635208, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 881 + }, + { + "epoch": 0.7044728434504792, + "grad_norm": 0.0596470907330513, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 882 + }, + { + "epoch": 0.7052715654952076, + "grad_norm": 0.1519233137369156, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 883 + }, + { + "epoch": 0.7060702875399361, + "grad_norm": 0.23089520633220673, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 884 + }, + { + "epoch": 0.7068690095846646, + "grad_norm": 0.20667214691638947, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 885 + }, + { + "epoch": 0.707667731629393, + "grad_norm": 0.10739922523498535, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 886 + }, + { + "epoch": 0.7084664536741214, + "grad_norm": 0.04334057494997978, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 887 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 0.15619881451129913, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 888 + }, + { + "epoch": 0.7100638977635783, + "grad_norm": 0.26618269085884094, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 889 + }, + { + "epoch": 0.7108626198083067, + "grad_norm": 0.1834406554698944, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 890 + }, + { + "epoch": 0.7116613418530351, + "grad_norm": 0.08332087099552155, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 891 + }, + { + "epoch": 0.7124600638977636, + "grad_norm": 0.23721523582935333, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 892 + }, + { + "epoch": 0.713258785942492, + "grad_norm": 0.2912815809249878, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 893 + }, + { + "epoch": 0.7140575079872205, + "grad_norm": 0.25534820556640625, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 894 + }, + { + "epoch": 0.7148562300319489, + "grad_norm": 0.14200575649738312, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 895 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.08668249845504761, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 896 + }, + { + "epoch": 0.7164536741214057, + "grad_norm": 0.2358543574810028, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 897 + }, + { + "epoch": 0.7172523961661342, + "grad_norm": 0.2729748487472534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 898 + }, + { + "epoch": 0.7180511182108626, + "grad_norm": 0.14862589538097382, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 899 + }, + { + "epoch": 0.7188498402555911, + "grad_norm": 0.14500044286251068, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 900 + }, + { + "epoch": 0.7196485623003195, + "grad_norm": 0.28659892082214355, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 901 + }, + { + "epoch": 0.7204472843450479, + "grad_norm": 0.2974075376987457, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 902 + }, + { + "epoch": 0.7212460063897763, + "grad_norm": 0.07839605212211609, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 903 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 0.2542141079902649, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 904 + }, + { + "epoch": 0.7228434504792333, + "grad_norm": 0.357192724943161, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 905 + }, + { + "epoch": 0.7236421725239617, + "grad_norm": 0.21535371243953705, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 906 + }, + { + "epoch": 0.7244408945686901, + "grad_norm": 0.08053386211395264, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 907 + }, + { + "epoch": 0.7252396166134185, + "grad_norm": 0.22670729458332062, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 908 + }, + { + "epoch": 0.7260383386581469, + "grad_norm": 0.21510791778564453, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 909 + }, + { + "epoch": 0.7268370607028753, + "grad_norm": 0.07556216418743134, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 910 + }, + { + "epoch": 0.7276357827476039, + "grad_norm": 0.08772645890712738, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 911 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.2531013488769531, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 912 + }, + { + "epoch": 0.7292332268370607, + "grad_norm": 0.06658858805894852, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 913 + }, + { + "epoch": 0.7300319488817891, + "grad_norm": 0.09869293123483658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 914 + }, + { + "epoch": 0.7308306709265175, + "grad_norm": 0.17758162319660187, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 915 + }, + { + "epoch": 0.731629392971246, + "grad_norm": 0.16267521679401398, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 916 + }, + { + "epoch": 0.7324281150159745, + "grad_norm": 0.09948690980672836, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 917 + }, + { + "epoch": 0.7332268370607029, + "grad_norm": 0.05900302529335022, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 918 + }, + { + "epoch": 0.7340255591054313, + "grad_norm": 0.08200150728225708, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 919 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 0.09217624366283417, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 920 + }, + { + "epoch": 0.7356230031948882, + "grad_norm": 0.12414196133613586, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 921 + }, + { + "epoch": 0.7364217252396166, + "grad_norm": 0.131890669465065, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 922 + }, + { + "epoch": 0.737220447284345, + "grad_norm": 0.1187182292342186, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 923 + }, + { + "epoch": 0.7380191693290735, + "grad_norm": 0.09890205413103104, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 924 + }, + { + "epoch": 0.7388178913738019, + "grad_norm": 0.06730851531028748, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 925 + }, + { + "epoch": 0.7396166134185304, + "grad_norm": 0.038627006113529205, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 926 + }, + { + "epoch": 0.7404153354632588, + "grad_norm": 0.07148899137973785, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 927 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 0.05876476690173149, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 928 + }, + { + "epoch": 0.7420127795527156, + "grad_norm": 0.11069595813751221, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 929 + }, + { + "epoch": 0.7428115015974441, + "grad_norm": 0.10409362614154816, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 930 + }, + { + "epoch": 0.7436102236421726, + "grad_norm": 0.08115468919277191, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 931 + }, + { + "epoch": 0.744408945686901, + "grad_norm": 0.14105193316936493, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 932 + }, + { + "epoch": 0.7452076677316294, + "grad_norm": 0.07780246436595917, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 933 + }, + { + "epoch": 0.7460063897763578, + "grad_norm": 0.08895678073167801, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 934 + }, + { + "epoch": 0.7468051118210862, + "grad_norm": 0.10844068974256516, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 935 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 0.07179753482341766, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 936 + }, + { + "epoch": 0.7484025559105432, + "grad_norm": 0.11107192933559418, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 937 + }, + { + "epoch": 0.7492012779552716, + "grad_norm": 0.2845052480697632, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 938 + }, + { + "epoch": 0.75, + "grad_norm": 0.41480058431625366, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 939 + }, + { + "epoch": 0.7507987220447284, + "grad_norm": 0.3101426064968109, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 940 + }, + { + "epoch": 0.7515974440894568, + "grad_norm": 0.09521801024675369, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 941 + }, + { + "epoch": 0.7523961661341853, + "grad_norm": 0.18613341450691223, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 942 + }, + { + "epoch": 0.7531948881789138, + "grad_norm": 0.2665672302246094, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 943 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.20693817734718323, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 944 + }, + { + "epoch": 0.7547923322683706, + "grad_norm": 0.05853262171149254, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 945 + }, + { + "epoch": 0.755591054313099, + "grad_norm": 0.22123664617538452, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 946 + }, + { + "epoch": 0.7563897763578274, + "grad_norm": 0.2845379114151001, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 947 + }, + { + "epoch": 0.7571884984025559, + "grad_norm": 0.20357397198677063, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 948 + }, + { + "epoch": 0.7579872204472844, + "grad_norm": 0.0897352546453476, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 949 + }, + { + "epoch": 0.7587859424920128, + "grad_norm": 0.06572771817445755, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 950 + }, + { + "epoch": 0.7595846645367412, + "grad_norm": 0.09441806375980377, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 951 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 0.06848953664302826, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 952 + }, + { + "epoch": 0.7611821086261981, + "grad_norm": 0.127177432179451, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 953 + }, + { + "epoch": 0.7619808306709265, + "grad_norm": 0.25466713309288025, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 954 + }, + { + "epoch": 0.762779552715655, + "grad_norm": 0.32952556014060974, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 955 + }, + { + "epoch": 0.7635782747603834, + "grad_norm": 0.2976897358894348, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 956 + }, + { + "epoch": 0.7643769968051118, + "grad_norm": 0.17444387078285217, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 957 + }, + { + "epoch": 0.7651757188498403, + "grad_norm": 0.10458981990814209, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 958 + }, + { + "epoch": 0.7659744408945687, + "grad_norm": 0.07028939574956894, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 959 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.1888386309146881, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 960 + }, + { + "epoch": 0.7675718849840255, + "grad_norm": 0.19400012493133545, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 961 + }, + { + "epoch": 0.768370607028754, + "grad_norm": 0.12069790065288544, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 962 + }, + { + "epoch": 0.7691693290734825, + "grad_norm": 0.06206851452589035, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 963 + }, + { + "epoch": 0.7699680511182109, + "grad_norm": 0.07195326685905457, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 964 + }, + { + "epoch": 0.7707667731629393, + "grad_norm": 0.09240477532148361, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 965 + }, + { + "epoch": 0.7715654952076677, + "grad_norm": 0.04433378204703331, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 966 + }, + { + "epoch": 0.7723642172523961, + "grad_norm": 0.07411819696426392, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 967 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 0.11440210789442062, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 968 + }, + { + "epoch": 0.7739616613418531, + "grad_norm": 0.23913118243217468, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 969 + }, + { + "epoch": 0.7747603833865815, + "grad_norm": 0.31028202176094055, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 970 + }, + { + "epoch": 0.7755591054313099, + "grad_norm": 0.3343825936317444, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 971 + }, + { + "epoch": 0.7763578274760383, + "grad_norm": 0.2559935748577118, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 972 + }, + { + "epoch": 0.7771565495207667, + "grad_norm": 0.05685359239578247, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 973 + }, + { + "epoch": 0.7779552715654952, + "grad_norm": 0.1760183721780777, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 974 + }, + { + "epoch": 0.7787539936102237, + "grad_norm": 0.25240832567214966, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 975 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.13724291324615479, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 976 + }, + { + "epoch": 0.7803514376996805, + "grad_norm": 0.11687567830085754, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 977 + }, + { + "epoch": 0.7811501597444089, + "grad_norm": 0.31319329142570496, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 978 + }, + { + "epoch": 0.7819488817891374, + "grad_norm": 0.3297184705734253, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 979 + }, + { + "epoch": 0.7827476038338658, + "grad_norm": 0.19443389773368835, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 980 + }, + { + "epoch": 0.7835463258785943, + "grad_norm": 0.04911043494939804, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 981 + }, + { + "epoch": 0.7843450479233227, + "grad_norm": 0.19837717711925507, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 982 + }, + { + "epoch": 0.7851437699680511, + "grad_norm": 0.23165349662303925, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 983 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 0.12156365066766739, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 984 + }, + { + "epoch": 0.786741214057508, + "grad_norm": 0.1305016428232193, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 985 + }, + { + "epoch": 0.7875399361022364, + "grad_norm": 0.12228422611951828, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 986 + }, + { + "epoch": 0.7883386581469649, + "grad_norm": 0.09014695137739182, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 987 + }, + { + "epoch": 0.7891373801916933, + "grad_norm": 0.060052234679460526, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 988 + }, + { + "epoch": 0.7899361022364217, + "grad_norm": 0.17842933535575867, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 989 + }, + { + "epoch": 0.7907348242811502, + "grad_norm": 0.2823020815849304, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 990 + }, + { + "epoch": 0.7915335463258786, + "grad_norm": 0.2571483254432678, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 991 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.11443623155355453, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 992 + }, + { + "epoch": 0.7931309904153354, + "grad_norm": 0.09048285335302353, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 993 + }, + { + "epoch": 0.7939297124600639, + "grad_norm": 0.1863749772310257, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 994 + }, + { + "epoch": 0.7947284345047924, + "grad_norm": 0.1481461524963379, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 995 + }, + { + "epoch": 0.7955271565495208, + "grad_norm": 0.06870540231466293, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 996 + }, + { + "epoch": 0.7963258785942492, + "grad_norm": 0.04223543405532837, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 997 + }, + { + "epoch": 0.7971246006389776, + "grad_norm": 0.04194851219654083, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 998 + }, + { + "epoch": 0.797923322683706, + "grad_norm": 0.03982497751712799, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 999 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 0.20985758304595947, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1000 + }, + { + "epoch": 0.799520766773163, + "grad_norm": 0.11346526443958282, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1001 + }, + { + "epoch": 0.8003194888178914, + "grad_norm": 0.16594401001930237, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1002 + }, + { + "epoch": 0.8011182108626198, + "grad_norm": 0.1788545846939087, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1003 + }, + { + "epoch": 0.8019169329073482, + "grad_norm": 0.07928512245416641, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1004 + }, + { + "epoch": 0.8027156549520766, + "grad_norm": 0.0953991562128067, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1005 + }, + { + "epoch": 0.8035143769968051, + "grad_norm": 0.2052081823348999, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1006 + }, + { + "epoch": 0.8043130990415336, + "grad_norm": 0.1999465525150299, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1007 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.09821965545415878, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1008 + }, + { + "epoch": 0.8059105431309904, + "grad_norm": 0.0762021467089653, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1009 + }, + { + "epoch": 0.8067092651757188, + "grad_norm": 0.20475991070270538, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1010 + }, + { + "epoch": 0.8075079872204473, + "grad_norm": 0.23028631508350372, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1011 + }, + { + "epoch": 0.8083067092651757, + "grad_norm": 0.12122747302055359, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1012 + }, + { + "epoch": 0.8091054313099042, + "grad_norm": 0.08124672621488571, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1013 + }, + { + "epoch": 0.8099041533546326, + "grad_norm": 0.21313415467739105, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1014 + }, + { + "epoch": 0.810702875399361, + "grad_norm": 0.311813622713089, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1015 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 0.3032541275024414, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1016 + }, + { + "epoch": 0.8123003194888179, + "grad_norm": 0.21727560460567474, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1017 + }, + { + "epoch": 0.8130990415335463, + "grad_norm": 0.0620480477809906, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1018 + }, + { + "epoch": 0.8138977635782748, + "grad_norm": 0.20105740427970886, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1019 + }, + { + "epoch": 0.8146964856230032, + "grad_norm": 0.28996244072914124, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1020 + }, + { + "epoch": 0.8154952076677316, + "grad_norm": 0.22115157544612885, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1021 + }, + { + "epoch": 0.8162939297124601, + "grad_norm": 0.10071029514074326, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1022 + }, + { + "epoch": 0.8170926517571885, + "grad_norm": 0.12363877147436142, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1023 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.29970163106918335, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1024 + }, + { + "epoch": 0.8186900958466453, + "grad_norm": 0.32754749059677124, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1025 + }, + { + "epoch": 0.8194888178913738, + "grad_norm": 0.20028825104236603, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1026 + }, + { + "epoch": 0.8202875399361023, + "grad_norm": 0.08162792772054672, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1027 + }, + { + "epoch": 0.8210862619808307, + "grad_norm": 0.27463749051094055, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1028 + }, + { + "epoch": 0.8218849840255591, + "grad_norm": 0.30335354804992676, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1029 + }, + { + "epoch": 0.8226837060702875, + "grad_norm": 0.12106633186340332, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1030 + }, + { + "epoch": 0.8234824281150159, + "grad_norm": 0.16331955790519714, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1031 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 0.2764187455177307, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1032 + }, + { + "epoch": 0.8250798722044729, + "grad_norm": 0.20136456191539764, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1033 + }, + { + "epoch": 0.8258785942492013, + "grad_norm": 0.06438590586185455, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1034 + }, + { + "epoch": 0.8266773162939297, + "grad_norm": 0.18764367699623108, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1035 + }, + { + "epoch": 0.8274760383386581, + "grad_norm": 0.20327645540237427, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1036 + }, + { + "epoch": 0.8282747603833865, + "grad_norm": 0.08825036138296127, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1037 + }, + { + "epoch": 0.829073482428115, + "grad_norm": 0.11037785559892654, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1038 + }, + { + "epoch": 0.8298722044728435, + "grad_norm": 0.18273280560970306, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1039 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.16820372641086578, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1040 + }, + { + "epoch": 0.8314696485623003, + "grad_norm": 0.06250625103712082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1041 + }, + { + "epoch": 0.8322683706070287, + "grad_norm": 0.12141115218400955, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1042 + }, + { + "epoch": 0.8330670926517572, + "grad_norm": 0.13594450056552887, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1043 + }, + { + "epoch": 0.8338658146964856, + "grad_norm": 0.16069599986076355, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1044 + }, + { + "epoch": 0.8346645367412141, + "grad_norm": 0.11631255596876144, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1045 + }, + { + "epoch": 0.8354632587859425, + "grad_norm": 0.050075192004442215, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1046 + }, + { + "epoch": 0.8362619808306709, + "grad_norm": 0.06317511945962906, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1047 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 0.09078527241945267, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1048 + }, + { + "epoch": 0.8378594249201278, + "grad_norm": 0.1618194878101349, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1049 + }, + { + "epoch": 0.8386581469648562, + "grad_norm": 0.2044777274131775, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1050 + }, + { + "epoch": 0.8394568690095847, + "grad_norm": 0.20439067482948303, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.8402555910543131, + "grad_norm": 0.1967901587486267, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1052 + }, + { + "epoch": 0.8410543130990416, + "grad_norm": 0.06829354166984558, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1053 + }, + { + "epoch": 0.84185303514377, + "grad_norm": 0.12168806046247482, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1054 + }, + { + "epoch": 0.8426517571884984, + "grad_norm": 0.23461978137493134, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1055 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.28916484117507935, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1056 + }, + { + "epoch": 0.8442492012779552, + "grad_norm": 0.21827733516693115, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1057 + }, + { + "epoch": 0.8450479233226837, + "grad_norm": 0.045396093279123306, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1058 + }, + { + "epoch": 0.8458466453674122, + "grad_norm": 0.2391543984413147, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1059 + }, + { + "epoch": 0.8466453674121406, + "grad_norm": 0.2916122078895569, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1060 + }, + { + "epoch": 0.847444089456869, + "grad_norm": 0.1589413434267044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1061 + }, + { + "epoch": 0.8482428115015974, + "grad_norm": 0.14869733154773712, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1062 + }, + { + "epoch": 0.8490415335463258, + "grad_norm": 0.3719956874847412, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 1063 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1064 + }, + { + "epoch": 0.8506389776357828, + "grad_norm": 0.22647641599178314, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1065 + }, + { + "epoch": 0.8514376996805112, + "grad_norm": 0.14329837262630463, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1066 + }, + { + "epoch": 0.8522364217252396, + "grad_norm": 0.2508337199687958, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1067 + }, + { + "epoch": 0.853035143769968, + "grad_norm": 0.16483807563781738, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1068 + }, + { + "epoch": 0.8538338658146964, + "grad_norm": 0.08231265842914581, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1069 + }, + { + "epoch": 0.854632587859425, + "grad_norm": 0.15707719326019287, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1070 + }, + { + "epoch": 0.8554313099041534, + "grad_norm": 0.1741408407688141, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1071 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.06281771510839462, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1072 + }, + { + "epoch": 0.8570287539936102, + "grad_norm": 0.10936494171619415, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1073 + }, + { + "epoch": 0.8578274760383386, + "grad_norm": 0.08680932223796844, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1074 + }, + { + "epoch": 0.8586261980830671, + "grad_norm": 0.05679824575781822, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1075 + }, + { + "epoch": 0.8594249201277955, + "grad_norm": 0.07635466009378433, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1076 + }, + { + "epoch": 0.860223642172524, + "grad_norm": 0.08391202241182327, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 1077 + }, + { + "epoch": 0.8610223642172524, + "grad_norm": 0.044910602271556854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1078 + }, + { + "epoch": 0.8618210862619808, + "grad_norm": 0.07833745330572128, + "learning_rate": 0.0005, + "loss": 1.0365, + "step": 1079 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 0.11653397232294083, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1080 + }, + { + "epoch": 0.8634185303514377, + "grad_norm": 0.09041672199964523, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1081 + }, + { + "epoch": 0.8642172523961661, + "grad_norm": 0.061735767871141434, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1082 + }, + { + "epoch": 0.8650159744408946, + "grad_norm": 0.042857520282268524, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1083 + }, + { + "epoch": 0.865814696485623, + "grad_norm": 0.040145136415958405, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1084 + }, + { + "epoch": 0.8666134185303515, + "grad_norm": 0.05785573646426201, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1085 + }, + { + "epoch": 0.8674121405750799, + "grad_norm": 0.13503877818584442, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1086 + }, + { + "epoch": 0.8682108626198083, + "grad_norm": 0.16243800520896912, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1087 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.13211014866828918, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1088 + }, + { + "epoch": 0.8698083067092651, + "grad_norm": 0.08136262744665146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1089 + }, + { + "epoch": 0.8706070287539937, + "grad_norm": 0.07881205528974533, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1090 + }, + { + "epoch": 0.8714057507987221, + "grad_norm": 0.1660437136888504, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1091 + }, + { + "epoch": 0.8722044728434505, + "grad_norm": 0.1955040693283081, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1092 + }, + { + "epoch": 0.8730031948881789, + "grad_norm": 0.18039803206920624, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1093 + }, + { + "epoch": 0.8738019169329073, + "grad_norm": 0.13832250237464905, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1094 + }, + { + "epoch": 0.8746006389776357, + "grad_norm": 0.06982281059026718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1095 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 0.06607141345739365, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1096 + }, + { + "epoch": 0.8761980830670927, + "grad_norm": 0.08685869723558426, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1097 + }, + { + "epoch": 0.8769968051118211, + "grad_norm": 0.09157849103212357, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1098 + }, + { + "epoch": 0.8777955271565495, + "grad_norm": 0.05980607122182846, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1099 + }, + { + "epoch": 0.8785942492012779, + "grad_norm": 0.05037426948547363, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1100 + }, + { + "epoch": 0.8793929712460063, + "grad_norm": 0.09998175501823425, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 1101 + }, + { + "epoch": 0.8801916932907349, + "grad_norm": 0.14255133271217346, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1102 + }, + { + "epoch": 0.8809904153354633, + "grad_norm": 0.1332579255104065, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1103 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.06453413516283035, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1104 + }, + { + "epoch": 0.8825878594249201, + "grad_norm": 0.07107783854007721, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1105 + }, + { + "epoch": 0.8833865814696485, + "grad_norm": 0.14025849103927612, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1106 + }, + { + "epoch": 0.884185303514377, + "grad_norm": 0.18791186809539795, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1107 + }, + { + "epoch": 0.8849840255591054, + "grad_norm": 0.228570356965065, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1108 + }, + { + "epoch": 0.8857827476038339, + "grad_norm": 0.21574346721172333, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1109 + }, + { + "epoch": 0.8865814696485623, + "grad_norm": 0.14833906292915344, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1110 + }, + { + "epoch": 0.8873801916932907, + "grad_norm": 0.04756765812635422, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1111 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 0.13023658096790314, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1112 + }, + { + "epoch": 0.8889776357827476, + "grad_norm": 0.21199558675289154, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1113 + }, + { + "epoch": 0.889776357827476, + "grad_norm": 0.19635719060897827, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1114 + }, + { + "epoch": 0.8905750798722045, + "grad_norm": 0.14753709733486176, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1115 + }, + { + "epoch": 0.8913738019169329, + "grad_norm": 0.06639572232961655, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1116 + }, + { + "epoch": 0.8921725239616614, + "grad_norm": 0.09707840532064438, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1117 + }, + { + "epoch": 0.8929712460063898, + "grad_norm": 0.20057998597621918, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 1118 + }, + { + "epoch": 0.8937699680511182, + "grad_norm": 0.232718825340271, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1119 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.16340196132659912, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1120 + }, + { + "epoch": 0.895367412140575, + "grad_norm": 0.04553915560245514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 1121 + }, + { + "epoch": 0.8961661341853036, + "grad_norm": 0.12561571598052979, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1122 + }, + { + "epoch": 0.896964856230032, + "grad_norm": 0.19254666566848755, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1123 + }, + { + "epoch": 0.8977635782747604, + "grad_norm": 0.12862572073936462, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1124 + }, + { + "epoch": 0.8985623003194888, + "grad_norm": 0.051237158477306366, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1125 + }, + { + "epoch": 0.8993610223642172, + "grad_norm": 0.18603810667991638, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1126 + }, + { + "epoch": 0.9001597444089456, + "grad_norm": 0.2498294860124588, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1127 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 0.18809954822063446, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1128 + }, + { + "epoch": 0.9017571884984026, + "grad_norm": 0.06116599217057228, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1129 + }, + { + "epoch": 0.902555910543131, + "grad_norm": 0.07710137963294983, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1130 + }, + { + "epoch": 0.9033546325878594, + "grad_norm": 0.11208303272724152, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1131 + }, + { + "epoch": 0.9041533546325878, + "grad_norm": 0.11864814907312393, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1132 + }, + { + "epoch": 0.9049520766773163, + "grad_norm": 0.1261119246482849, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1133 + }, + { + "epoch": 0.9057507987220448, + "grad_norm": 0.10841526836156845, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1134 + }, + { + "epoch": 0.9065495207667732, + "grad_norm": 0.04871276393532753, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1135 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.08953645080327988, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1136 + }, + { + "epoch": 0.90814696485623, + "grad_norm": 0.1590365469455719, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1137 + }, + { + "epoch": 0.9089456869009584, + "grad_norm": 0.155691459774971, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1138 + }, + { + "epoch": 0.9097444089456869, + "grad_norm": 0.09982484579086304, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1139 + }, + { + "epoch": 0.9105431309904153, + "grad_norm": 0.08257611095905304, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1140 + }, + { + "epoch": 0.9113418530351438, + "grad_norm": 0.1036139577627182, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1141 + }, + { + "epoch": 0.9121405750798722, + "grad_norm": 0.06543707102537155, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1142 + }, + { + "epoch": 0.9129392971246006, + "grad_norm": 0.05375903844833374, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1143 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 0.13674795627593994, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1144 + }, + { + "epoch": 0.9145367412140575, + "grad_norm": 0.21575352549552917, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 1145 + }, + { + "epoch": 0.9153354632587859, + "grad_norm": 0.22478559613227844, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1146 + }, + { + "epoch": 0.9161341853035144, + "grad_norm": 0.1854555904865265, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1147 + }, + { + "epoch": 0.9169329073482428, + "grad_norm": 0.08605340123176575, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1148 + }, + { + "epoch": 0.9177316293929713, + "grad_norm": 0.14082656800746918, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1149 + }, + { + "epoch": 0.9185303514376997, + "grad_norm": 0.3214903771877289, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1150 + }, + { + "epoch": 0.9193290734824281, + "grad_norm": 0.4360012412071228, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 1151 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.3582250773906708, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1152 + }, + { + "epoch": 0.920926517571885, + "grad_norm": 0.1142783984541893, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1153 + }, + { + "epoch": 0.9217252396166135, + "grad_norm": 0.2035343497991562, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1154 + }, + { + "epoch": 0.9225239616613419, + "grad_norm": 0.3506172299385071, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1155 + }, + { + "epoch": 0.9233226837060703, + "grad_norm": 0.2129906564950943, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1156 + }, + { + "epoch": 0.9241214057507987, + "grad_norm": 0.12158108502626419, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1157 + }, + { + "epoch": 0.9249201277955271, + "grad_norm": 0.3931717872619629, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1158 + }, + { + "epoch": 0.9257188498402555, + "grad_norm": 0.36336907744407654, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 1159 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 0.06781382113695145, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1160 + }, + { + "epoch": 0.9273162939297125, + "grad_norm": 0.3335910141468048, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1161 + }, + { + "epoch": 0.9281150159744409, + "grad_norm": 0.5017055869102478, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 1162 + }, + { + "epoch": 0.9289137380191693, + "grad_norm": 0.3635455071926117, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1163 + }, + { + "epoch": 0.9297124600638977, + "grad_norm": 0.06748906522989273, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1164 + }, + { + "epoch": 0.9305111821086262, + "grad_norm": 0.3723882734775543, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1165 + }, + { + "epoch": 0.9313099041533547, + "grad_norm": 0.2976631820201874, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1166 + }, + { + "epoch": 0.9321086261980831, + "grad_norm": 0.06998804211616516, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1167 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.3307324945926666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1168 + }, + { + "epoch": 0.9337060702875399, + "grad_norm": 0.29726436734199524, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1169 + }, + { + "epoch": 0.9345047923322684, + "grad_norm": 0.048596691340208054, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1170 + }, + { + "epoch": 0.9353035143769968, + "grad_norm": 0.2840823233127594, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1171 + }, + { + "epoch": 0.9361022364217252, + "grad_norm": 0.31426292657852173, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 1172 + }, + { + "epoch": 0.9369009584664537, + "grad_norm": 0.16073261201381683, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1173 + }, + { + "epoch": 0.9376996805111821, + "grad_norm": 0.05725392326712608, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1174 + }, + { + "epoch": 0.9384984025559105, + "grad_norm": 0.1674586981534958, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1175 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 0.13738949596881866, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1176 + }, + { + "epoch": 0.9400958466453674, + "grad_norm": 0.05350235849618912, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1177 + }, + { + "epoch": 0.9408945686900958, + "grad_norm": 0.10518805682659149, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1178 + }, + { + "epoch": 0.9416932907348243, + "grad_norm": 0.11264974623918533, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1179 + }, + { + "epoch": 0.9424920127795527, + "grad_norm": 0.06757227331399918, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1180 + }, + { + "epoch": 0.9432907348242812, + "grad_norm": 0.07214303314685822, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1181 + }, + { + "epoch": 0.9440894568690096, + "grad_norm": 0.12705406546592712, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1182 + }, + { + "epoch": 0.944888178913738, + "grad_norm": 0.09937570244073868, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 1183 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 0.05628623813390732, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1184 + }, + { + "epoch": 0.9464856230031949, + "grad_norm": 0.05685505270957947, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1185 + }, + { + "epoch": 0.9472843450479234, + "grad_norm": 0.06150783598423004, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1186 + }, + { + "epoch": 0.9480830670926518, + "grad_norm": 0.04247362166643143, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1187 + }, + { + "epoch": 0.9488817891373802, + "grad_norm": 0.05664962902665138, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1188 + }, + { + "epoch": 0.9496805111821086, + "grad_norm": 0.07421324402093887, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1189 + }, + { + "epoch": 0.950479233226837, + "grad_norm": 0.043645020574331284, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1190 + }, + { + "epoch": 0.9512779552715654, + "grad_norm": 0.0692208856344223, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1191 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 0.13804891705513, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1192 + }, + { + "epoch": 0.9528753993610224, + "grad_norm": 0.14874884486198425, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1193 + }, + { + "epoch": 0.9536741214057508, + "grad_norm": 0.08449128270149231, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1194 + }, + { + "epoch": 0.9544728434504792, + "grad_norm": 0.035032968968153, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1195 + }, + { + "epoch": 0.9552715654952076, + "grad_norm": 0.10837965458631516, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1196 + }, + { + "epoch": 0.9560702875399361, + "grad_norm": 0.17972581088542938, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1197 + }, + { + "epoch": 0.9568690095846646, + "grad_norm": 0.17075787484645844, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1198 + }, + { + "epoch": 0.957667731629393, + "grad_norm": 0.08269231766462326, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1199 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.07269515842199326, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1200 + }, + { + "epoch": 0.9592651757188498, + "grad_norm": 0.15345947444438934, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 1201 + }, + { + "epoch": 0.9600638977635783, + "grad_norm": 0.19025452435016632, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1202 + }, + { + "epoch": 0.9608626198083067, + "grad_norm": 0.1782686710357666, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1203 + }, + { + "epoch": 0.9616613418530351, + "grad_norm": 0.1296931356191635, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1204 + }, + { + "epoch": 0.9624600638977636, + "grad_norm": 0.036208219826221466, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1205 + }, + { + "epoch": 0.963258785942492, + "grad_norm": 0.14282052218914032, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1206 + }, + { + "epoch": 0.9640575079872205, + "grad_norm": 0.26539498567581177, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1207 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 0.28352224826812744, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1208 + }, + { + "epoch": 0.9656549520766773, + "grad_norm": 0.14476369321346283, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1209 + }, + { + "epoch": 0.9664536741214057, + "grad_norm": 0.06859725713729858, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1210 + }, + { + "epoch": 0.9672523961661342, + "grad_norm": 0.19093726575374603, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1211 + }, + { + "epoch": 0.9680511182108626, + "grad_norm": 0.1848185807466507, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1212 + }, + { + "epoch": 0.9688498402555911, + "grad_norm": 0.05829976871609688, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1213 + }, + { + "epoch": 0.9696485623003195, + "grad_norm": 0.10105405002832413, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1214 + }, + { + "epoch": 0.9704472843450479, + "grad_norm": 0.12762011587619781, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1215 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.08238376677036285, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1216 + }, + { + "epoch": 0.9720447284345048, + "grad_norm": 0.07039444148540497, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1217 + }, + { + "epoch": 0.9728434504792333, + "grad_norm": 0.1320599615573883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1218 + }, + { + "epoch": 0.9736421725239617, + "grad_norm": 0.07799404859542847, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1219 + }, + { + "epoch": 0.9744408945686901, + "grad_norm": 0.11601961404085159, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1220 + }, + { + "epoch": 0.9752396166134185, + "grad_norm": 0.26134374737739563, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1221 + }, + { + "epoch": 0.9760383386581469, + "grad_norm": 0.275513231754303, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1222 + }, + { + "epoch": 0.9768370607028753, + "grad_norm": 0.0711631178855896, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1223 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 0.1879139244556427, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1224 + }, + { + "epoch": 0.9784345047923323, + "grad_norm": 0.24822647869586945, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1225 + }, + { + "epoch": 0.9792332268370607, + "grad_norm": 0.1244853138923645, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1226 + }, + { + "epoch": 0.9800319488817891, + "grad_norm": 0.07694529742002487, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1227 + }, + { + "epoch": 0.9808306709265175, + "grad_norm": 0.1280626803636551, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1228 + }, + { + "epoch": 0.981629392971246, + "grad_norm": 0.09127703309059143, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1229 + }, + { + "epoch": 0.9824281150159745, + "grad_norm": 0.06747932732105255, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1230 + }, + { + "epoch": 0.9832268370607029, + "grad_norm": 0.08196533471345901, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1231 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.09074689447879791, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1232 + }, + { + "epoch": 0.9848242811501597, + "grad_norm": 0.06031282991170883, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1233 + }, + { + "epoch": 0.9856230031948882, + "grad_norm": 0.07138215005397797, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1234 + }, + { + "epoch": 0.9864217252396166, + "grad_norm": 0.11056806892156601, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1235 + }, + { + "epoch": 0.987220447284345, + "grad_norm": 0.09108638018369675, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1236 + }, + { + "epoch": 0.9880191693290735, + "grad_norm": 0.0515020377933979, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1237 + }, + { + "epoch": 0.9888178913738019, + "grad_norm": 0.08467873930931091, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1238 + }, + { + "epoch": 0.9896166134185304, + "grad_norm": 0.10424523055553436, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1239 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 0.11506868153810501, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1240 + }, + { + "epoch": 0.9912140575079872, + "grad_norm": 0.13226476311683655, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1241 + }, + { + "epoch": 0.9920127795527156, + "grad_norm": 0.13714630901813507, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1242 + }, + { + "epoch": 0.9928115015974441, + "grad_norm": 0.08985403180122375, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1243 + }, + { + "epoch": 0.9936102236421726, + "grad_norm": 0.1107666939496994, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 1244 + }, + { + "epoch": 0.994408945686901, + "grad_norm": 0.130653515458107, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1245 + }, + { + "epoch": 0.9952076677316294, + "grad_norm": 0.10675778985023499, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1246 + }, + { + "epoch": 0.9960063897763578, + "grad_norm": 0.042045243084430695, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1247 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.07957674562931061, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1248 + }, + { + "epoch": 0.9976038338658147, + "grad_norm": 0.06926224380731583, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1249 + }, + { + "epoch": 0.9984025559105432, + "grad_norm": 0.0849846750497818, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1250 + }, + { + "epoch": 0.9992012779552716, + "grad_norm": 0.12501482665538788, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1251 + }, + { + "epoch": 1.0, + "grad_norm": 0.1467234194278717, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1252 + }, + { + "epoch": 1.0007987220447285, + "grad_norm": 0.11206725984811783, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1253 + }, + { + "epoch": 1.0015974440894568, + "grad_norm": 0.05224297568202019, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1254 + }, + { + "epoch": 1.0023961661341854, + "grad_norm": 0.15176911652088165, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1255 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.22419261932373047, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1256 + }, + { + "epoch": 1.0039936102236422, + "grad_norm": 0.18444369733333588, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1257 + }, + { + "epoch": 1.0047923322683705, + "grad_norm": 0.06510337442159653, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1258 + }, + { + "epoch": 1.005591054313099, + "grad_norm": 0.16058789193630219, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1259 + }, + { + "epoch": 1.0063897763578276, + "grad_norm": 0.22726313769817352, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1260 + }, + { + "epoch": 1.0071884984025559, + "grad_norm": 0.21050630509853363, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1261 + }, + { + "epoch": 1.0079872204472844, + "grad_norm": 0.09227188676595688, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1262 + }, + { + "epoch": 1.0087859424920127, + "grad_norm": 0.11473584920167923, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 1263 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 0.12692919373512268, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1264 + }, + { + "epoch": 1.0103833865814698, + "grad_norm": 0.056371819227933884, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1265 + }, + { + "epoch": 1.011182108626198, + "grad_norm": 0.13166245818138123, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1266 + }, + { + "epoch": 1.0119808306709266, + "grad_norm": 0.2606523633003235, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1267 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 0.320832759141922, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1268 + }, + { + "epoch": 1.0135782747603834, + "grad_norm": 0.2074427455663681, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1269 + }, + { + "epoch": 1.0143769968051117, + "grad_norm": 0.05768958851695061, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1270 + }, + { + "epoch": 1.0151757188498403, + "grad_norm": 0.08107002079486847, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1271 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 0.12996292114257812, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1272 + }, + { + "epoch": 1.016773162939297, + "grad_norm": 0.1514650285243988, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1273 + }, + { + "epoch": 1.0175718849840256, + "grad_norm": 0.1007395088672638, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1274 + }, + { + "epoch": 1.018370607028754, + "grad_norm": 0.0831306204199791, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1275 + }, + { + "epoch": 1.0191693290734825, + "grad_norm": 0.09004336595535278, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1276 + }, + { + "epoch": 1.0199680511182108, + "grad_norm": 0.06632232666015625, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1277 + }, + { + "epoch": 1.0207667731629393, + "grad_norm": 0.05073424428701401, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1278 + }, + { + "epoch": 1.0215654952076678, + "grad_norm": 0.06486333161592484, + "learning_rate": 0.0005, + "loss": 1.034, + "step": 1279 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 0.1137472614645958, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1280 + }, + { + "epoch": 1.0231629392971247, + "grad_norm": 0.08062250912189484, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1281 + }, + { + "epoch": 1.023961661341853, + "grad_norm": 0.05046350136399269, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1282 + }, + { + "epoch": 1.0247603833865815, + "grad_norm": 0.06503880023956299, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1283 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 0.10730332881212234, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1284 + }, + { + "epoch": 1.0263578274760383, + "grad_norm": 0.12077611684799194, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1285 + }, + { + "epoch": 1.0271565495207668, + "grad_norm": 0.15061219036579132, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1286 + }, + { + "epoch": 1.0279552715654952, + "grad_norm": 0.15091058611869812, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1287 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 0.07299874722957611, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1288 + }, + { + "epoch": 1.029552715654952, + "grad_norm": 0.09598413854837418, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1289 + }, + { + "epoch": 1.0303514376996805, + "grad_norm": 0.21661055088043213, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 1290 + }, + { + "epoch": 1.031150159744409, + "grad_norm": 0.24777255952358246, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1291 + }, + { + "epoch": 1.0319488817891374, + "grad_norm": 0.17097236216068268, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1292 + }, + { + "epoch": 1.0327476038338659, + "grad_norm": 0.05266748368740082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1293 + }, + { + "epoch": 1.0335463258785942, + "grad_norm": 0.12484195083379745, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1294 + }, + { + "epoch": 1.0343450479233227, + "grad_norm": 0.1802505999803543, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1295 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 0.10778877139091492, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1296 + }, + { + "epoch": 1.0359424920127795, + "grad_norm": 0.046645063906908035, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1297 + }, + { + "epoch": 1.036741214057508, + "grad_norm": 0.11727745085954666, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1298 + }, + { + "epoch": 1.0375399361022364, + "grad_norm": 0.1356390118598938, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1299 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 0.08130940794944763, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1300 + }, + { + "epoch": 1.0391373801916932, + "grad_norm": 0.07274319976568222, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1301 + }, + { + "epoch": 1.0399361022364217, + "grad_norm": 0.20339541137218475, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1302 + }, + { + "epoch": 1.04073482428115, + "grad_norm": 0.27819424867630005, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 1303 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 0.25879770517349243, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1304 + }, + { + "epoch": 1.042332268370607, + "grad_norm": 0.12683863937854767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1305 + }, + { + "epoch": 1.0431309904153354, + "grad_norm": 0.13531504571437836, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1306 + }, + { + "epoch": 1.043929712460064, + "grad_norm": 0.3203699588775635, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1307 + }, + { + "epoch": 1.0447284345047922, + "grad_norm": 0.3073630630970001, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1308 + }, + { + "epoch": 1.0455271565495208, + "grad_norm": 0.13184015452861786, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1309 + }, + { + "epoch": 1.0463258785942493, + "grad_norm": 0.1311715543270111, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1310 + }, + { + "epoch": 1.0471246006389776, + "grad_norm": 0.24470581114292145, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1311 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 0.21901719272136688, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1312 + }, + { + "epoch": 1.0487220447284344, + "grad_norm": 0.08105460554361343, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1313 + }, + { + "epoch": 1.049520766773163, + "grad_norm": 0.14864705502986908, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1314 + }, + { + "epoch": 1.0503194888178913, + "grad_norm": 0.20006732642650604, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1315 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 0.06233162060379982, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1316 + }, + { + "epoch": 1.0519169329073483, + "grad_norm": 0.12691672146320343, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1317 + }, + { + "epoch": 1.0527156549520766, + "grad_norm": 0.18303292989730835, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1318 + }, + { + "epoch": 1.0535143769968052, + "grad_norm": 0.13289928436279297, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1319 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 0.03847618028521538, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1320 + }, + { + "epoch": 1.055111821086262, + "grad_norm": 0.1317387968301773, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1321 + }, + { + "epoch": 1.0559105431309903, + "grad_norm": 0.1663348227739334, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1322 + }, + { + "epoch": 1.0567092651757188, + "grad_norm": 0.0657038614153862, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1323 + }, + { + "epoch": 1.0575079872204474, + "grad_norm": 0.1484680026769638, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1324 + }, + { + "epoch": 1.0583067092651757, + "grad_norm": 0.299824595451355, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1325 + }, + { + "epoch": 1.0591054313099042, + "grad_norm": 0.3598216772079468, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1326 + }, + { + "epoch": 1.0599041533546325, + "grad_norm": 0.25792455673217773, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1327 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 0.04925544187426567, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1328 + }, + { + "epoch": 1.0615015974440896, + "grad_norm": 0.2568669319152832, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1329 + }, + { + "epoch": 1.0623003194888179, + "grad_norm": 0.2679016590118408, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1330 + }, + { + "epoch": 1.0630990415335464, + "grad_norm": 0.12100119888782501, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1331 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 0.17324721813201904, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1332 + }, + { + "epoch": 1.0646964856230032, + "grad_norm": 0.34452658891677856, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1333 + }, + { + "epoch": 1.0654952076677315, + "grad_norm": 0.24561382830142975, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1334 + }, + { + "epoch": 1.06629392971246, + "grad_norm": 0.06080634891986847, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1335 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 0.249319925904274, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1336 + }, + { + "epoch": 1.067891373801917, + "grad_norm": 0.2586004436016083, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1337 + }, + { + "epoch": 1.0686900958466454, + "grad_norm": 0.07297322154045105, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1338 + }, + { + "epoch": 1.0694888178913737, + "grad_norm": 0.20853886008262634, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1339 + }, + { + "epoch": 1.0702875399361023, + "grad_norm": 0.3214154541492462, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1340 + }, + { + "epoch": 1.0710862619808306, + "grad_norm": 0.16169136762619019, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1341 + }, + { + "epoch": 1.071884984025559, + "grad_norm": 0.18989364802837372, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1342 + }, + { + "epoch": 1.0726837060702876, + "grad_norm": 0.42826735973358154, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1343 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 0.35387369990348816, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1344 + }, + { + "epoch": 1.0742811501597445, + "grad_norm": 0.061617862433195114, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 1345 + }, + { + "epoch": 1.0750798722044728, + "grad_norm": 0.3348129987716675, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1346 + }, + { + "epoch": 1.0758785942492013, + "grad_norm": 0.3622291088104248, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1347 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 0.12743668258190155, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1348 + }, + { + "epoch": 1.0774760383386581, + "grad_norm": 0.2464202642440796, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1349 + }, + { + "epoch": 1.0782747603833867, + "grad_norm": 0.3873802423477173, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1350 + }, + { + "epoch": 1.079073482428115, + "grad_norm": 0.22619839012622833, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1351 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 0.09080081433057785, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 1352 + }, + { + "epoch": 1.0806709265175718, + "grad_norm": 0.31380224227905273, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1353 + }, + { + "epoch": 1.0814696485623003, + "grad_norm": 0.2782067060470581, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1354 + }, + { + "epoch": 1.0822683706070289, + "grad_norm": 0.04267412796616554, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1355 + }, + { + "epoch": 1.0830670926517572, + "grad_norm": 0.2687273919582367, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1356 + }, + { + "epoch": 1.0838658146964857, + "grad_norm": 0.3133341073989868, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1357 + }, + { + "epoch": 1.084664536741214, + "grad_norm": 0.11658725887537003, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1358 + }, + { + "epoch": 1.0854632587859425, + "grad_norm": 0.1339937299489975, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1359 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 0.15727631747722626, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1360 + }, + { + "epoch": 1.0870607028753994, + "grad_norm": 0.11759792268276215, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1361 + }, + { + "epoch": 1.0878594249201279, + "grad_norm": 0.11522746086120605, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1362 + }, + { + "epoch": 1.0886581469648562, + "grad_norm": 0.16571135818958282, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1363 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 0.09467484056949615, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1364 + }, + { + "epoch": 1.090255591054313, + "grad_norm": 0.07887586951255798, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1365 + }, + { + "epoch": 1.0910543130990416, + "grad_norm": 0.11297929286956787, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1366 + }, + { + "epoch": 1.09185303514377, + "grad_norm": 0.06402980536222458, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1367 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 0.11947043240070343, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1368 + }, + { + "epoch": 1.093450479233227, + "grad_norm": 0.06244207173585892, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1369 + }, + { + "epoch": 1.0942492012779552, + "grad_norm": 0.08165531605482101, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1370 + }, + { + "epoch": 1.0950479233226837, + "grad_norm": 0.03842553123831749, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 1371 + }, + { + "epoch": 1.095846645367412, + "grad_norm": 0.12175651639699936, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1372 + }, + { + "epoch": 1.0966453674121406, + "grad_norm": 0.1720212697982788, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1373 + }, + { + "epoch": 1.097444089456869, + "grad_norm": 0.15540143847465515, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1374 + }, + { + "epoch": 1.0982428115015974, + "grad_norm": 0.1056036502122879, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1375 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 0.06738443672657013, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1376 + }, + { + "epoch": 1.0998402555910542, + "grad_norm": 0.09600193798542023, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1377 + }, + { + "epoch": 1.1006389776357828, + "grad_norm": 0.11872005462646484, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1378 + }, + { + "epoch": 1.101437699680511, + "grad_norm": 0.04837389290332794, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1379 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 0.11245802789926529, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1380 + }, + { + "epoch": 1.1030351437699681, + "grad_norm": 0.1525758057832718, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1381 + }, + { + "epoch": 1.1038338658146964, + "grad_norm": 0.07688060402870178, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1382 + }, + { + "epoch": 1.104632587859425, + "grad_norm": 0.05793362855911255, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1383 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 0.09737680107355118, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1384 + }, + { + "epoch": 1.1062300319488818, + "grad_norm": 0.15511851012706757, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1385 + }, + { + "epoch": 1.1070287539936103, + "grad_norm": 0.14931945502758026, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1386 + }, + { + "epoch": 1.1078274760383386, + "grad_norm": 0.1451406478881836, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1387 + }, + { + "epoch": 1.1086261980830672, + "grad_norm": 0.06013273820281029, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 1388 + }, + { + "epoch": 1.1094249201277955, + "grad_norm": 0.08433987945318222, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1389 + }, + { + "epoch": 1.110223642172524, + "grad_norm": 0.12601709365844727, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1390 + }, + { + "epoch": 1.1110223642172523, + "grad_norm": 0.14611507952213287, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1391 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 0.10526898503303528, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1392 + }, + { + "epoch": 1.1126198083067094, + "grad_norm": 0.03592250496149063, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1393 + }, + { + "epoch": 1.1134185303514377, + "grad_norm": 0.07883994281291962, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1394 + }, + { + "epoch": 1.1142172523961662, + "grad_norm": 0.1351863145828247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1395 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 0.10423804074525833, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1396 + }, + { + "epoch": 1.115814696485623, + "grad_norm": 0.05230586603283882, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1397 + }, + { + "epoch": 1.1166134185303513, + "grad_norm": 0.03962033987045288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1398 + }, + { + "epoch": 1.1174121405750799, + "grad_norm": 0.08950864523649216, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1399 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.1326761394739151, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1400 + }, + { + "epoch": 1.1190095846645367, + "grad_norm": 0.1251986175775528, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1401 + }, + { + "epoch": 1.1198083067092652, + "grad_norm": 0.05831597000360489, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1402 + }, + { + "epoch": 1.1206070287539935, + "grad_norm": 0.11382800340652466, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1403 + }, + { + "epoch": 1.121405750798722, + "grad_norm": 0.16290108859539032, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1404 + }, + { + "epoch": 1.1222044728434506, + "grad_norm": 0.1721554696559906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1405 + }, + { + "epoch": 1.123003194888179, + "grad_norm": 0.09426763653755188, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1406 + }, + { + "epoch": 1.1238019169329074, + "grad_norm": 0.037366580218076706, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1407 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 0.07456237077713013, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1408 + }, + { + "epoch": 1.1253993610223643, + "grad_norm": 0.11701856553554535, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1409 + }, + { + "epoch": 1.1261980830670926, + "grad_norm": 0.13261918723583221, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1410 + }, + { + "epoch": 1.126996805111821, + "grad_norm": 0.09014345705509186, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1411 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 0.05398619920015335, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1412 + }, + { + "epoch": 1.128594249201278, + "grad_norm": 0.09375960379838943, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1413 + }, + { + "epoch": 1.1293929712460065, + "grad_norm": 0.09307628124952316, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1414 + }, + { + "epoch": 1.1301916932907348, + "grad_norm": 0.09488195180892944, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1415 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 0.08067089319229126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1416 + }, + { + "epoch": 1.1317891373801916, + "grad_norm": 0.043899055570364, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1417 + }, + { + "epoch": 1.1325878594249201, + "grad_norm": 0.05593986064195633, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1418 + }, + { + "epoch": 1.1333865814696487, + "grad_norm": 0.05736452341079712, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1419 + }, + { + "epoch": 1.134185303514377, + "grad_norm": 0.1092999204993248, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1420 + }, + { + "epoch": 1.1349840255591055, + "grad_norm": 0.18366938829421997, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 1421 + }, + { + "epoch": 1.1357827476038338, + "grad_norm": 0.177176833152771, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1422 + }, + { + "epoch": 1.1365814696485623, + "grad_norm": 0.08829191327095032, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1423 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 0.07169382274150848, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1424 + }, + { + "epoch": 1.1381789137380192, + "grad_norm": 0.130388081073761, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1425 + }, + { + "epoch": 1.1389776357827477, + "grad_norm": 0.20726168155670166, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1426 + }, + { + "epoch": 1.139776357827476, + "grad_norm": 0.21683751046657562, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1427 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 0.131125345826149, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1428 + }, + { + "epoch": 1.1413738019169328, + "grad_norm": 0.04309925064444542, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1429 + }, + { + "epoch": 1.1421725239616614, + "grad_norm": 0.14427928626537323, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1430 + }, + { + "epoch": 1.1429712460063897, + "grad_norm": 0.1743481606245041, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1431 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 0.1037210002541542, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1432 + }, + { + "epoch": 1.1445686900958467, + "grad_norm": 0.11162228137254715, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1433 + }, + { + "epoch": 1.145367412140575, + "grad_norm": 0.25445371866226196, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1434 + }, + { + "epoch": 1.1461661341853036, + "grad_norm": 0.2771884799003601, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1435 + }, + { + "epoch": 1.1469648562300319, + "grad_norm": 0.10653509199619293, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1436 + }, + { + "epoch": 1.1477635782747604, + "grad_norm": 0.1745259016752243, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1437 + }, + { + "epoch": 1.148562300319489, + "grad_norm": 0.3151826560497284, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1438 + }, + { + "epoch": 1.1493610223642172, + "grad_norm": 0.23229722678661346, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1439 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 0.06131701543927193, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1440 + }, + { + "epoch": 1.150958466453674, + "grad_norm": 0.28753313422203064, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1441 + }, + { + "epoch": 1.1517571884984026, + "grad_norm": 0.3178791105747223, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1442 + }, + { + "epoch": 1.1525559105431311, + "grad_norm": 0.10008880496025085, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1443 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 0.2418096512556076, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1444 + }, + { + "epoch": 1.154153354632588, + "grad_norm": 0.34728583693504333, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1445 + }, + { + "epoch": 1.1549520766773163, + "grad_norm": 0.2172212153673172, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1446 + }, + { + "epoch": 1.1557507987220448, + "grad_norm": 0.04184277728199959, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1447 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 0.19960719347000122, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1448 + }, + { + "epoch": 1.1573482428115016, + "grad_norm": 0.19261692464351654, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1449 + }, + { + "epoch": 1.15814696485623, + "grad_norm": 0.08326124399900436, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1450 + }, + { + "epoch": 1.1589456869009584, + "grad_norm": 0.08552456647157669, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1451 + }, + { + "epoch": 1.159744408945687, + "grad_norm": 0.07903868705034256, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 1452 + }, + { + "epoch": 1.1605431309904153, + "grad_norm": 0.045095205307006836, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1453 + }, + { + "epoch": 1.1613418530351438, + "grad_norm": 0.08293266594409943, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1454 + }, + { + "epoch": 1.1621405750798721, + "grad_norm": 0.09431439638137817, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1455 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 0.04189104586839676, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 1456 + }, + { + "epoch": 1.1637380191693292, + "grad_norm": 0.11492408066987991, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1457 + }, + { + "epoch": 1.1645367412140575, + "grad_norm": 0.16648449003696442, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1458 + }, + { + "epoch": 1.165335463258786, + "grad_norm": 0.1532576084136963, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1459 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 0.07438737154006958, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 1460 + }, + { + "epoch": 1.1669329073482428, + "grad_norm": 0.0887872502207756, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 1461 + }, + { + "epoch": 1.1677316293929714, + "grad_norm": 0.17035096883773804, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1462 + }, + { + "epoch": 1.1685303514376997, + "grad_norm": 0.12702526152133942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1463 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 0.04788994789123535, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1464 + }, + { + "epoch": 1.1701277955271565, + "grad_norm": 0.15093912184238434, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1465 + }, + { + "epoch": 1.170926517571885, + "grad_norm": 0.1428089439868927, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1466 + }, + { + "epoch": 1.1717252396166133, + "grad_norm": 0.039421554654836655, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1467 + }, + { + "epoch": 1.1725239616613419, + "grad_norm": 0.09461840242147446, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1468 + }, + { + "epoch": 1.1733226837060702, + "grad_norm": 0.07272787392139435, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1469 + }, + { + "epoch": 1.1741214057507987, + "grad_norm": 0.10863790661096573, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1470 + }, + { + "epoch": 1.1749201277955272, + "grad_norm": 0.211805522441864, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1471 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 0.2124311476945877, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1472 + }, + { + "epoch": 1.176517571884984, + "grad_norm": 0.14013712108135223, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1473 + }, + { + "epoch": 1.1773162939297124, + "grad_norm": 0.10768178105354309, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1474 + }, + { + "epoch": 1.178115015974441, + "grad_norm": 0.07961699366569519, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1475 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 0.0772516280412674, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1476 + }, + { + "epoch": 1.1797124600638977, + "grad_norm": 0.11957084387540817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1477 + }, + { + "epoch": 1.1805111821086263, + "grad_norm": 0.1976107954978943, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1478 + }, + { + "epoch": 1.1813099041533546, + "grad_norm": 0.20915871858596802, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1479 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 0.10857495665550232, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1480 + }, + { + "epoch": 1.1829073482428114, + "grad_norm": 0.09961260855197906, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1481 + }, + { + "epoch": 1.18370607028754, + "grad_norm": 0.11908663064241409, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1482 + }, + { + "epoch": 1.1845047923322685, + "grad_norm": 0.0982719212770462, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1483 + }, + { + "epoch": 1.1853035143769968, + "grad_norm": 0.05869903787970543, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1484 + }, + { + "epoch": 1.1861022364217253, + "grad_norm": 0.14943145215511322, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1485 + }, + { + "epoch": 1.1869009584664536, + "grad_norm": 0.1761479526758194, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 1486 + }, + { + "epoch": 1.1876996805111821, + "grad_norm": 0.1393168866634369, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1487 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 0.0473988801240921, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1488 + }, + { + "epoch": 1.189297124600639, + "grad_norm": 0.20789027214050293, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1489 + }, + { + "epoch": 1.1900958466453675, + "grad_norm": 0.29456260800361633, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1490 + }, + { + "epoch": 1.1908945686900958, + "grad_norm": 0.1875244528055191, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1491 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 0.052052468061447144, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1492 + }, + { + "epoch": 1.1924920127795526, + "grad_norm": 0.1376652717590332, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1493 + }, + { + "epoch": 1.1932907348242812, + "grad_norm": 0.1656588762998581, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1494 + }, + { + "epoch": 1.1940894568690097, + "grad_norm": 0.07063707709312439, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 1495 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 0.12681347131729126, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1496 + }, + { + "epoch": 1.1956869009584665, + "grad_norm": 0.17560099065303802, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1497 + }, + { + "epoch": 1.1964856230031948, + "grad_norm": 0.10635025054216385, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1498 + }, + { + "epoch": 1.1972843450479234, + "grad_norm": 0.061567965894937515, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1499 + }, + { + "epoch": 1.1980830670926517, + "grad_norm": 0.12346719950437546, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1500 + }, + { + "epoch": 1.1988817891373802, + "grad_norm": 0.07105513662099838, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1501 + }, + { + "epoch": 1.1996805111821087, + "grad_norm": 0.07719466835260391, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1502 + }, + { + "epoch": 1.200479233226837, + "grad_norm": 0.1478763371706009, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1503 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 0.1383642554283142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1504 + }, + { + "epoch": 1.2020766773162939, + "grad_norm": 0.05519767478108406, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1505 + }, + { + "epoch": 1.2028753993610224, + "grad_norm": 0.06807537376880646, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1506 + }, + { + "epoch": 1.2036741214057507, + "grad_norm": 0.10652226209640503, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1507 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 0.044540517032146454, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1508 + }, + { + "epoch": 1.2052715654952078, + "grad_norm": 0.12266546487808228, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1509 + }, + { + "epoch": 1.206070287539936, + "grad_norm": 0.1997641921043396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1510 + }, + { + "epoch": 1.2068690095846646, + "grad_norm": 0.1924593299627304, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1511 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 0.09990391880273819, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 1512 + }, + { + "epoch": 1.2084664536741214, + "grad_norm": 0.04226391762495041, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1513 + }, + { + "epoch": 1.20926517571885, + "grad_norm": 0.07116132974624634, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1514 + }, + { + "epoch": 1.2100638977635783, + "grad_norm": 0.046046894043684006, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1515 + }, + { + "epoch": 1.2108626198083068, + "grad_norm": 0.039608217775821686, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1516 + }, + { + "epoch": 1.211661341853035, + "grad_norm": 0.055937573313713074, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1517 + }, + { + "epoch": 1.2124600638977636, + "grad_norm": 0.09269243478775024, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 1518 + }, + { + "epoch": 1.213258785942492, + "grad_norm": 0.04349381849169731, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1519 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 0.08543939888477325, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 1520 + }, + { + "epoch": 1.2148562300319488, + "grad_norm": 0.1829536110162735, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1521 + }, + { + "epoch": 1.2156549520766773, + "grad_norm": 0.23422624170780182, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 1522 + }, + { + "epoch": 1.2164536741214058, + "grad_norm": 0.13391408324241638, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1523 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 0.07262124121189117, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1524 + }, + { + "epoch": 1.2180511182108626, + "grad_norm": 0.1842898577451706, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1525 + }, + { + "epoch": 1.218849840255591, + "grad_norm": 0.16982080042362213, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1526 + }, + { + "epoch": 1.2196485623003195, + "grad_norm": 0.07628878951072693, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1527 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 0.07903175801038742, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1528 + }, + { + "epoch": 1.2212460063897763, + "grad_norm": 0.1874074637889862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1529 + }, + { + "epoch": 1.2220447284345048, + "grad_norm": 0.2084639072418213, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1530 + }, + { + "epoch": 1.2228434504792332, + "grad_norm": 0.161276176571846, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1531 + }, + { + "epoch": 1.2236421725239617, + "grad_norm": 0.07408371567726135, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1532 + }, + { + "epoch": 1.2244408945686902, + "grad_norm": 0.06918113678693771, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1533 + }, + { + "epoch": 1.2252396166134185, + "grad_norm": 0.15813148021697998, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1534 + }, + { + "epoch": 1.226038338658147, + "grad_norm": 0.1454530507326126, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 1535 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 0.07441768050193787, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1536 + }, + { + "epoch": 1.2276357827476039, + "grad_norm": 0.19151917099952698, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1537 + }, + { + "epoch": 1.2284345047923322, + "grad_norm": 0.22358526289463043, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1538 + }, + { + "epoch": 1.2292332268370607, + "grad_norm": 0.12382426857948303, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1539 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 0.09593929350376129, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1540 + }, + { + "epoch": 1.2308306709265175, + "grad_norm": 0.32887372374534607, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1541 + }, + { + "epoch": 1.231629392971246, + "grad_norm": 0.3910810351371765, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1542 + }, + { + "epoch": 1.2324281150159744, + "grad_norm": 0.21341568231582642, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1543 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 0.10242578387260437, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1544 + }, + { + "epoch": 1.2340255591054312, + "grad_norm": 0.2556541860103607, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 1.2348242811501597, + "grad_norm": 0.22671715915203094, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1546 + }, + { + "epoch": 1.2356230031948883, + "grad_norm": 0.05781029909849167, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1547 + }, + { + "epoch": 1.2364217252396166, + "grad_norm": 0.2803215980529785, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1548 + }, + { + "epoch": 1.237220447284345, + "grad_norm": 0.3391420543193817, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1549 + }, + { + "epoch": 1.2380191693290734, + "grad_norm": 0.17648665606975555, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1550 + }, + { + "epoch": 1.238817891373802, + "grad_norm": 0.14975208044052124, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1551 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 0.2930659353733063, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 1552 + }, + { + "epoch": 1.2404153354632588, + "grad_norm": 0.16080376505851746, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1553 + }, + { + "epoch": 1.2412140575079873, + "grad_norm": 0.1765553057193756, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1554 + }, + { + "epoch": 1.2420127795527156, + "grad_norm": 0.43610313534736633, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1555 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 0.3448547124862671, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1556 + }, + { + "epoch": 1.2436102236421724, + "grad_norm": 0.11257574707269669, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1557 + }, + { + "epoch": 1.244408945686901, + "grad_norm": 0.2212686389684677, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1558 + }, + { + "epoch": 1.2452076677316293, + "grad_norm": 0.24576987326145172, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 1559 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 0.07592078298330307, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1560 + }, + { + "epoch": 1.2468051118210863, + "grad_norm": 0.18566438555717468, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1561 + }, + { + "epoch": 1.2476038338658146, + "grad_norm": 0.2345304936170578, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1562 + }, + { + "epoch": 1.2484025559105432, + "grad_norm": 0.12168031930923462, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 1563 + }, + { + "epoch": 1.2492012779552715, + "grad_norm": 0.10168169438838959, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 1564 + }, + { + "epoch": 1.25, + "grad_norm": 0.14832071959972382, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1565 + }, + { + "epoch": 1.2507987220447285, + "grad_norm": 0.04516097158193588, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1566 + }, + { + "epoch": 1.2515974440894568, + "grad_norm": 0.14377422630786896, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1567 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 0.12483170628547668, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1568 + }, + { + "epoch": 1.2531948881789137, + "grad_norm": 0.06861971318721771, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 1569 + }, + { + "epoch": 1.2539936102236422, + "grad_norm": 0.1124153807759285, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1570 + }, + { + "epoch": 1.2547923322683707, + "grad_norm": 0.16883404552936554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1571 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 0.09533397108316422, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1572 + }, + { + "epoch": 1.2563897763578276, + "grad_norm": 0.09215923398733139, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1573 + }, + { + "epoch": 1.2571884984025559, + "grad_norm": 0.12701599299907684, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1574 + }, + { + "epoch": 1.2579872204472844, + "grad_norm": 0.09106232225894928, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1575 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 0.047954440116882324, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1576 + }, + { + "epoch": 1.2595846645367412, + "grad_norm": 0.13917528092861176, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1577 + }, + { + "epoch": 1.2603833865814695, + "grad_norm": 0.17694029211997986, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1578 + }, + { + "epoch": 1.261182108626198, + "grad_norm": 0.11021065711975098, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1579 + }, + { + "epoch": 1.2619808306709266, + "grad_norm": 0.03982831537723541, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1580 + }, + { + "epoch": 1.262779552715655, + "grad_norm": 0.08759493380784988, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 1581 + }, + { + "epoch": 1.2635782747603834, + "grad_norm": 0.04797520861029625, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1582 + }, + { + "epoch": 1.2643769968051117, + "grad_norm": 0.049942485988140106, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 1583 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 0.04236803576350212, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1584 + }, + { + "epoch": 1.2659744408945688, + "grad_norm": 0.05938104912638664, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1585 + }, + { + "epoch": 1.266773162939297, + "grad_norm": 0.07487885653972626, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1586 + }, + { + "epoch": 1.2675718849840256, + "grad_norm": 0.063072569668293, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1587 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 0.07140504568815231, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1588 + }, + { + "epoch": 1.2691693290734825, + "grad_norm": 0.04790132865309715, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1589 + }, + { + "epoch": 1.269968051118211, + "grad_norm": 0.050013668835163116, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1590 + }, + { + "epoch": 1.2707667731629393, + "grad_norm": 0.0559731163084507, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 1591 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 0.04633013904094696, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1592 + }, + { + "epoch": 1.2723642172523961, + "grad_norm": 0.05252271518111229, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1593 + }, + { + "epoch": 1.2731629392971247, + "grad_norm": 0.0902840718626976, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1594 + }, + { + "epoch": 1.273961661341853, + "grad_norm": 0.07961871474981308, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1595 + }, + { + "epoch": 1.2747603833865815, + "grad_norm": 0.07653608173131943, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1596 + }, + { + "epoch": 1.2755591054313098, + "grad_norm": 0.15634121000766754, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 1597 + }, + { + "epoch": 1.2763578274760383, + "grad_norm": 0.2045222818851471, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 1598 + }, + { + "epoch": 1.2771565495207668, + "grad_norm": 0.1769608110189438, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1599 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.09675133973360062, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1600 + }, + { + "epoch": 1.2787539936102237, + "grad_norm": 0.055832285434007645, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 1601 + }, + { + "epoch": 1.279552715654952, + "grad_norm": 0.09108291566371918, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1602 + }, + { + "epoch": 1.2803514376996805, + "grad_norm": 0.10872901976108551, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1603 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 0.08771848678588867, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 1604 + }, + { + "epoch": 1.2819488817891374, + "grad_norm": 0.0731026753783226, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 1605 + }, + { + "epoch": 1.2827476038338659, + "grad_norm": 0.040664345026016235, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1606 + }, + { + "epoch": 1.2835463258785942, + "grad_norm": 0.06111081317067146, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1607 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 0.08753795176744461, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1608 + }, + { + "epoch": 1.2851437699680512, + "grad_norm": 0.07113729417324066, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1609 + }, + { + "epoch": 1.2859424920127795, + "grad_norm": 0.05469372868537903, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1610 + }, + { + "epoch": 1.2867412140575079, + "grad_norm": 0.05748649686574936, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1611 + }, + { + "epoch": 1.2875399361022364, + "grad_norm": 0.05832446366548538, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1612 + }, + { + "epoch": 1.288338658146965, + "grad_norm": 0.06085522472858429, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1613 + }, + { + "epoch": 1.2891373801916932, + "grad_norm": 0.08154775947332382, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1614 + }, + { + "epoch": 1.2899361022364217, + "grad_norm": 0.11568816751241684, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 1615 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 0.06356564909219742, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1616 + }, + { + "epoch": 1.2915335463258786, + "grad_norm": 0.08187399804592133, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1617 + }, + { + "epoch": 1.292332268370607, + "grad_norm": 0.05326744168996811, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 1618 + }, + { + "epoch": 1.2931309904153354, + "grad_norm": 0.05407040938735008, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1619 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 0.07292867451906204, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 1620 + }, + { + "epoch": 1.2947284345047922, + "grad_norm": 0.09447437524795532, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1621 + }, + { + "epoch": 1.2955271565495208, + "grad_norm": 0.0592079721391201, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 1622 + }, + { + "epoch": 1.2963258785942493, + "grad_norm": 0.052008479833602905, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 1623 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 0.06381972879171371, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1624 + }, + { + "epoch": 1.2979233226837061, + "grad_norm": 0.07434900850057602, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1625 + }, + { + "epoch": 1.2987220447284344, + "grad_norm": 0.06477486342191696, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1626 + }, + { + "epoch": 1.299520766773163, + "grad_norm": 0.13730554282665253, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1627 + }, + { + "epoch": 1.3003194888178915, + "grad_norm": 0.1683935821056366, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1628 + }, + { + "epoch": 1.3011182108626198, + "grad_norm": 0.08616848289966583, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1629 + }, + { + "epoch": 1.3019169329073481, + "grad_norm": 0.10220590978860855, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1630 + }, + { + "epoch": 1.3027156549520766, + "grad_norm": 0.22036917507648468, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1631 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.2277965545654297, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1632 + }, + { + "epoch": 1.3043130990415335, + "grad_norm": 0.10426606982946396, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1633 + }, + { + "epoch": 1.305111821086262, + "grad_norm": 0.06641022861003876, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1634 + }, + { + "epoch": 1.3059105431309903, + "grad_norm": 0.09100072830915451, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1635 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 0.06551069766283035, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 1636 + }, + { + "epoch": 1.3075079872204474, + "grad_norm": 0.04397547245025635, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1637 + }, + { + "epoch": 1.3083067092651757, + "grad_norm": 0.0781746581196785, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1638 + }, + { + "epoch": 1.3091054313099042, + "grad_norm": 0.07852843403816223, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 1639 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 0.09224545955657959, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1640 + }, + { + "epoch": 1.310702875399361, + "grad_norm": 0.10179189592599869, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1641 + }, + { + "epoch": 1.3115015974440896, + "grad_norm": 0.07562009245157242, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1642 + }, + { + "epoch": 1.3123003194888179, + "grad_norm": 0.15463820099830627, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1643 + }, + { + "epoch": 1.3130990415335464, + "grad_norm": 0.05742334946990013, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 1644 + }, + { + "epoch": 1.3138977635782747, + "grad_norm": 0.09010195732116699, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1645 + }, + { + "epoch": 1.3146964856230032, + "grad_norm": 0.04284297674894333, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1646 + }, + { + "epoch": 1.3154952076677318, + "grad_norm": 0.07167239487171173, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1647 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 0.04978404566645622, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1648 + }, + { + "epoch": 1.3170926517571884, + "grad_norm": 0.2888668477535248, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1649 + }, + { + "epoch": 1.317891373801917, + "grad_norm": 0.13716880977153778, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1650 + }, + { + "epoch": 1.3186900958466454, + "grad_norm": 0.13081762194633484, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1651 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 0.046977054327726364, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1652 + }, + { + "epoch": 1.3202875399361023, + "grad_norm": 0.1331615000963211, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 1653 + }, + { + "epoch": 1.3210862619808306, + "grad_norm": 0.21066126227378845, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 1654 + }, + { + "epoch": 1.321884984025559, + "grad_norm": 0.23017194867134094, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1655 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 0.20224629342556, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 1656 + }, + { + "epoch": 1.323482428115016, + "grad_norm": 0.09836700558662415, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1657 + }, + { + "epoch": 1.3242811501597445, + "grad_norm": 0.10621663928031921, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1658 + }, + { + "epoch": 1.3250798722044728, + "grad_norm": 0.25464868545532227, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1659 + }, + { + "epoch": 1.3258785942492013, + "grad_norm": 0.39965251088142395, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1660 + }, + { + "epoch": 1.3266773162939298, + "grad_norm": 0.4731796383857727, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1661 + }, + { + "epoch": 1.3274760383386581, + "grad_norm": 0.4287014603614807, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1662 + }, + { + "epoch": 1.3282747603833867, + "grad_norm": 0.15660974383354187, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1663 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.14340882003307343, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1664 + }, + { + "epoch": 1.3298722044728435, + "grad_norm": 0.23041795194149017, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1665 + }, + { + "epoch": 1.330670926517572, + "grad_norm": 0.14607569575309753, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1666 + }, + { + "epoch": 1.3314696485623003, + "grad_norm": 0.0620175264775753, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1667 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 0.1722227782011032, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1668 + }, + { + "epoch": 1.3330670926517572, + "grad_norm": 0.17676329612731934, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1669 + }, + { + "epoch": 1.3338658146964857, + "grad_norm": 0.10175948590040207, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1670 + }, + { + "epoch": 1.334664536741214, + "grad_norm": 0.052259646356105804, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1671 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 0.11740414053201675, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1672 + }, + { + "epoch": 1.3362619808306708, + "grad_norm": 0.13614653050899506, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1673 + }, + { + "epoch": 1.3370607028753994, + "grad_norm": 0.12058388441801071, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 1674 + }, + { + "epoch": 1.3378594249201279, + "grad_norm": 0.12473122030496597, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1675 + }, + { + "epoch": 1.3386581469648562, + "grad_norm": 0.11198705434799194, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1676 + }, + { + "epoch": 1.3394568690095847, + "grad_norm": 0.06745828688144684, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1677 + }, + { + "epoch": 1.340255591054313, + "grad_norm": 0.06042877584695816, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1678 + }, + { + "epoch": 1.3410543130990416, + "grad_norm": 0.08762289583683014, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1679 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 0.07612926512956619, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1680 + }, + { + "epoch": 1.3426517571884984, + "grad_norm": 0.16108228266239166, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1681 + }, + { + "epoch": 1.343450479233227, + "grad_norm": 0.12803438305854797, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1682 + }, + { + "epoch": 1.3442492012779552, + "grad_norm": 0.09190207719802856, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1683 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 0.07201807200908661, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1684 + }, + { + "epoch": 1.3458466453674123, + "grad_norm": 0.06885793805122375, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 1685 + }, + { + "epoch": 1.3466453674121406, + "grad_norm": 0.06998719274997711, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1686 + }, + { + "epoch": 1.3474440894568689, + "grad_norm": 0.08072122186422348, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1687 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 0.1314389705657959, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1688 + }, + { + "epoch": 1.349041533546326, + "grad_norm": 0.1393643617630005, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1689 + }, + { + "epoch": 1.3498402555910542, + "grad_norm": 0.1482846736907959, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1690 + }, + { + "epoch": 1.3506389776357828, + "grad_norm": 0.10097873955965042, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1691 + }, + { + "epoch": 1.351437699680511, + "grad_norm": 0.16020123660564423, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 1692 + }, + { + "epoch": 1.3522364217252396, + "grad_norm": 0.4032374322414398, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1693 + }, + { + "epoch": 1.3530351437699681, + "grad_norm": 0.21653197705745697, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1694 + }, + { + "epoch": 1.3538338658146964, + "grad_norm": 0.18634478747844696, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1695 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 0.06293921917676926, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1696 + }, + { + "epoch": 1.3554313099041533, + "grad_norm": 0.09862471371889114, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1697 + }, + { + "epoch": 1.3562300319488818, + "grad_norm": 0.17562821507453918, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 1698 + }, + { + "epoch": 1.3570287539936103, + "grad_norm": 0.17277459800243378, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1699 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 0.06883158534765244, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1700 + }, + { + "epoch": 1.3586261980830672, + "grad_norm": 0.06487718969583511, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1701 + }, + { + "epoch": 1.3594249201277955, + "grad_norm": 0.08988886326551437, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1702 + }, + { + "epoch": 1.360223642172524, + "grad_norm": 0.05164919048547745, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1703 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 0.143778458237648, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 1704 + }, + { + "epoch": 1.3618210862619808, + "grad_norm": 0.21736390888690948, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1705 + }, + { + "epoch": 1.3626198083067091, + "grad_norm": 0.2496086061000824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1706 + }, + { + "epoch": 1.3634185303514377, + "grad_norm": 0.21299317479133606, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1707 + }, + { + "epoch": 1.3642172523961662, + "grad_norm": 0.06845723092556, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 1708 + }, + { + "epoch": 1.3650159744408945, + "grad_norm": 0.14018614590168, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1709 + }, + { + "epoch": 1.365814696485623, + "grad_norm": 0.1971539407968521, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 1710 + }, + { + "epoch": 1.3666134185303513, + "grad_norm": 0.10819724202156067, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 1711 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 0.12900666892528534, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1712 + }, + { + "epoch": 1.3682108626198084, + "grad_norm": 0.17080886662006378, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1713 + }, + { + "epoch": 1.3690095846645367, + "grad_norm": 0.22689902782440186, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1714 + }, + { + "epoch": 1.3698083067092652, + "grad_norm": 0.2200036197900772, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1715 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 0.15193268656730652, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 1716 + }, + { + "epoch": 1.371405750798722, + "grad_norm": 0.057297177612781525, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 1717 + }, + { + "epoch": 1.3722044728434506, + "grad_norm": 0.12024576961994171, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 1718 + }, + { + "epoch": 1.373003194888179, + "grad_norm": 0.16183575987815857, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1719 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 0.14740106463432312, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 1720 + }, + { + "epoch": 1.3746006389776357, + "grad_norm": 0.09009548276662827, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 1721 + }, + { + "epoch": 1.3753993610223643, + "grad_norm": 0.05091484636068344, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1722 + }, + { + "epoch": 1.3761980830670926, + "grad_norm": 0.05887647345662117, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1723 + }, + { + "epoch": 1.376996805111821, + "grad_norm": 0.06313642859458923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1724 + }, + { + "epoch": 1.3777955271565494, + "grad_norm": 0.06496263295412064, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 1725 + }, + { + "epoch": 1.378594249201278, + "grad_norm": 0.06047922000288963, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1726 + }, + { + "epoch": 1.3793929712460065, + "grad_norm": 0.05579136312007904, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 1727 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 0.05931869521737099, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 1728 + }, + { + "epoch": 1.3809904153354633, + "grad_norm": 0.049043234437704086, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1729 + }, + { + "epoch": 1.3817891373801916, + "grad_norm": 0.051883842796087265, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1730 + }, + { + "epoch": 1.3825878594249201, + "grad_norm": 0.07195441424846649, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1731 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 0.12339463829994202, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1732 + }, + { + "epoch": 1.384185303514377, + "grad_norm": 0.16951170563697815, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1733 + }, + { + "epoch": 1.3849840255591055, + "grad_norm": 0.1773078590631485, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1734 + }, + { + "epoch": 1.3857827476038338, + "grad_norm": 0.15160880982875824, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1735 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 0.12933489680290222, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1736 + }, + { + "epoch": 1.3873801916932909, + "grad_norm": 0.05910791456699371, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1737 + }, + { + "epoch": 1.3881789137380192, + "grad_norm": 0.06765501946210861, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1738 + }, + { + "epoch": 1.3889776357827475, + "grad_norm": 0.09179043024778366, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1739 + }, + { + "epoch": 1.389776357827476, + "grad_norm": 0.08842387795448303, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 1740 + }, + { + "epoch": 1.3905750798722045, + "grad_norm": 0.07700884342193604, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1741 + }, + { + "epoch": 1.3913738019169328, + "grad_norm": 0.045392196625471115, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1742 + }, + { + "epoch": 1.3921725239616614, + "grad_norm": 0.11977320909500122, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1743 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 0.1882479041814804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 1744 + }, + { + "epoch": 1.3937699680511182, + "grad_norm": 0.25021475553512573, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1745 + }, + { + "epoch": 1.3945686900958467, + "grad_norm": 0.23374556005001068, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1746 + }, + { + "epoch": 1.395367412140575, + "grad_norm": 0.1016339659690857, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1747 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 0.1340985745191574, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1748 + }, + { + "epoch": 1.3969648562300319, + "grad_norm": 0.21048963069915771, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1749 + }, + { + "epoch": 1.3977635782747604, + "grad_norm": 0.20711666345596313, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1750 + }, + { + "epoch": 1.398562300319489, + "grad_norm": 0.19101384282112122, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 1751 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 0.17655788362026215, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1752 + }, + { + "epoch": 1.4001597444089458, + "grad_norm": 0.11994078010320663, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 1753 + }, + { + "epoch": 1.400958466453674, + "grad_norm": 0.09805315732955933, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1754 + }, + { + "epoch": 1.4017571884984026, + "grad_norm": 0.07474519312381744, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 1755 + }, + { + "epoch": 1.4025559105431311, + "grad_norm": 0.11269772797822952, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1756 + }, + { + "epoch": 1.4033546325878594, + "grad_norm": 0.08900775015354156, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1757 + }, + { + "epoch": 1.4041533546325877, + "grad_norm": 0.05614674836397171, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1758 + }, + { + "epoch": 1.4049520766773163, + "grad_norm": 0.12895621359348297, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1759 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 0.16433797776699066, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 1760 + }, + { + "epoch": 1.406549520766773, + "grad_norm": 0.20009422302246094, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1761 + }, + { + "epoch": 1.4073482428115016, + "grad_norm": 0.146495059132576, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1762 + }, + { + "epoch": 1.40814696485623, + "grad_norm": 0.07518120110034943, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1763 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 0.09864111244678497, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 1764 + }, + { + "epoch": 1.409744408945687, + "grad_norm": 0.20213425159454346, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 1765 + }, + { + "epoch": 1.4105431309904153, + "grad_norm": 0.17369656264781952, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1766 + }, + { + "epoch": 1.4113418530351438, + "grad_norm": 0.06627536565065384, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1767 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 0.09098218381404877, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 1768 + }, + { + "epoch": 1.4129392971246006, + "grad_norm": 0.11730248481035233, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1769 + }, + { + "epoch": 1.4137380191693292, + "grad_norm": 0.07061973959207535, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 1770 + }, + { + "epoch": 1.4145367412140575, + "grad_norm": 0.10279946774244308, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 1771 + }, + { + "epoch": 1.415335463258786, + "grad_norm": 0.18082919716835022, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1772 + }, + { + "epoch": 1.4161341853035143, + "grad_norm": 0.1592867076396942, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1773 + }, + { + "epoch": 1.4169329073482428, + "grad_norm": 0.09976492077112198, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1774 + }, + { + "epoch": 1.4177316293929714, + "grad_norm": 0.060737378895282745, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 1775 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 0.06248186528682709, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1776 + }, + { + "epoch": 1.419329073482428, + "grad_norm": 0.13300968706607819, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 1777 + }, + { + "epoch": 1.4201277955271565, + "grad_norm": 0.1979697346687317, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 1778 + }, + { + "epoch": 1.420926517571885, + "grad_norm": 0.23268306255340576, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1779 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 0.18313626945018768, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1780 + }, + { + "epoch": 1.4225239616613419, + "grad_norm": 0.08110051602125168, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1781 + }, + { + "epoch": 1.4233226837060702, + "grad_norm": 0.09732743352651596, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1782 + }, + { + "epoch": 1.4241214057507987, + "grad_norm": 0.1656067669391632, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1783 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 0.1959427297115326, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1784 + }, + { + "epoch": 1.4257188498402555, + "grad_norm": 0.17609809339046478, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 1785 + }, + { + "epoch": 1.426517571884984, + "grad_norm": 0.0999840646982193, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1786 + }, + { + "epoch": 1.4273162939297124, + "grad_norm": 0.06475909799337387, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1787 + }, + { + "epoch": 1.428115015974441, + "grad_norm": 0.1364496946334839, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 1788 + }, + { + "epoch": 1.4289137380191694, + "grad_norm": 0.21113638579845428, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 1789 + }, + { + "epoch": 1.4297124600638977, + "grad_norm": 0.25998085737228394, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1790 + }, + { + "epoch": 1.4305111821086263, + "grad_norm": 0.24930700659751892, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1791 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 0.131307452917099, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1792 + }, + { + "epoch": 1.432108626198083, + "grad_norm": 0.0739457756280899, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 1793 + }, + { + "epoch": 1.4329073482428116, + "grad_norm": 0.2009744644165039, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 1794 + }, + { + "epoch": 1.43370607028754, + "grad_norm": 0.28875023126602173, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1795 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 0.25421038269996643, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1796 + }, + { + "epoch": 1.4353035143769968, + "grad_norm": 0.09670932590961456, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1797 + }, + { + "epoch": 1.4361022364217253, + "grad_norm": 0.11264955252408981, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1798 + }, + { + "epoch": 1.4369009584664536, + "grad_norm": 0.1401909440755844, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1799 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 0.08234099298715591, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 1800 + }, + { + "epoch": 1.4384984025559104, + "grad_norm": 0.05028436705470085, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1801 + }, + { + "epoch": 1.439297124600639, + "grad_norm": 0.04673704132437706, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1802 + }, + { + "epoch": 1.4400958466453675, + "grad_norm": 0.07369101047515869, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 1803 + }, + { + "epoch": 1.4408945686900958, + "grad_norm": 0.161424919962883, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1804 + }, + { + "epoch": 1.4416932907348243, + "grad_norm": 0.13576306402683258, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1805 + }, + { + "epoch": 1.4424920127795526, + "grad_norm": 0.063505619764328, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 1806 + }, + { + "epoch": 1.4432907348242812, + "grad_norm": 0.07231617718935013, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1807 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 0.1698617935180664, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 1808 + }, + { + "epoch": 1.444888178913738, + "grad_norm": 0.16520395874977112, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 1809 + }, + { + "epoch": 1.4456869009584665, + "grad_norm": 0.058485522866249084, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 1810 + }, + { + "epoch": 1.4464856230031948, + "grad_norm": 0.0816773921251297, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1811 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 0.15307661890983582, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1812 + }, + { + "epoch": 1.4480830670926519, + "grad_norm": 0.20710408687591553, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 1813 + }, + { + "epoch": 1.4488817891373802, + "grad_norm": 0.1786869764328003, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1814 + }, + { + "epoch": 1.4496805111821085, + "grad_norm": 0.07363469898700714, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 1815 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 0.10158272087574005, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 1816 + }, + { + "epoch": 1.4512779552715656, + "grad_norm": 0.14304493367671967, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1817 + }, + { + "epoch": 1.4520766773162939, + "grad_norm": 0.11782495677471161, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 1818 + }, + { + "epoch": 1.4528753993610224, + "grad_norm": 0.09340433776378632, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 1819 + }, + { + "epoch": 1.4536741214057507, + "grad_norm": 0.08881603926420212, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 1820 + }, + { + "epoch": 1.4544728434504792, + "grad_norm": 0.1377323865890503, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1821 + }, + { + "epoch": 1.4552715654952078, + "grad_norm": 0.1137915700674057, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 1822 + }, + { + "epoch": 1.456070287539936, + "grad_norm": 0.08219580352306366, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1823 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 0.048282165080308914, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 1824 + }, + { + "epoch": 1.457667731629393, + "grad_norm": 0.07061316817998886, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1825 + }, + { + "epoch": 1.4584664536741214, + "grad_norm": 0.09383007138967514, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1826 + }, + { + "epoch": 1.45926517571885, + "grad_norm": 0.10688310861587524, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 1827 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 0.09751323610544205, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 1828 + }, + { + "epoch": 1.4608626198083068, + "grad_norm": 0.10437846183776855, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 1829 + }, + { + "epoch": 1.461661341853035, + "grad_norm": 0.13903124630451202, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 1830 + }, + { + "epoch": 1.4624600638977636, + "grad_norm": 0.09480495005846024, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1831 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 0.062304843217134476, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1832 + }, + { + "epoch": 1.4640575079872205, + "grad_norm": 0.13482356071472168, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 1833 + }, + { + "epoch": 1.4648562300319488, + "grad_norm": 0.2302182912826538, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 1834 + }, + { + "epoch": 1.4656549520766773, + "grad_norm": 0.28565964102745056, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1835 + }, + { + "epoch": 1.4664536741214058, + "grad_norm": 0.28437626361846924, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1836 + }, + { + "epoch": 1.4672523961661341, + "grad_norm": 0.20637334883213043, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 1837 + }, + { + "epoch": 1.4680511182108626, + "grad_norm": 0.08829299360513687, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1838 + }, + { + "epoch": 1.468849840255591, + "grad_norm": 0.06338132172822952, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 1839 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 0.13094602525234222, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 1840 + }, + { + "epoch": 1.470447284345048, + "grad_norm": 0.15911467373371124, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 1841 + }, + { + "epoch": 1.4712460063897763, + "grad_norm": 0.10913829505443573, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 1842 + }, + { + "epoch": 1.4720447284345048, + "grad_norm": 0.06934744864702225, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1843 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.07930968701839447, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1844 + }, + { + "epoch": 1.4736421725239617, + "grad_norm": 0.11225491017103195, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1845 + }, + { + "epoch": 1.4744408945686902, + "grad_norm": 0.12815739214420319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1846 + }, + { + "epoch": 1.4752396166134185, + "grad_norm": 0.0943179577589035, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1847 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 0.051353566348552704, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1848 + }, + { + "epoch": 1.4768370607028753, + "grad_norm": 0.10284367203712463, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1849 + }, + { + "epoch": 1.4776357827476039, + "grad_norm": 0.18345551192760468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1850 + }, + { + "epoch": 1.4784345047923324, + "grad_norm": 0.19532762467861176, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 1851 + }, + { + "epoch": 1.4792332268370607, + "grad_norm": 0.12518467009067535, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1852 + }, + { + "epoch": 1.480031948881789, + "grad_norm": 0.05363085865974426, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1853 + }, + { + "epoch": 1.4808306709265175, + "grad_norm": 0.18222568929195404, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 1854 + }, + { + "epoch": 1.481629392971246, + "grad_norm": 0.19992542266845703, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1855 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 0.1724570095539093, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 1856 + }, + { + "epoch": 1.483226837060703, + "grad_norm": 0.04096012935042381, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 1857 + }, + { + "epoch": 1.4840255591054312, + "grad_norm": 0.15409474074840546, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1858 + }, + { + "epoch": 1.4848242811501597, + "grad_norm": 0.29238876700401306, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1859 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 0.35619401931762695, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1860 + }, + { + "epoch": 1.4864217252396166, + "grad_norm": 0.2790282964706421, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1861 + }, + { + "epoch": 1.487220447284345, + "grad_norm": 0.0809629037976265, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1862 + }, + { + "epoch": 1.4880191693290734, + "grad_norm": 0.1827513724565506, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1863 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 0.2284395545721054, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1864 + }, + { + "epoch": 1.4896166134185305, + "grad_norm": 0.11697912216186523, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1865 + }, + { + "epoch": 1.4904153354632588, + "grad_norm": 0.08668534457683563, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 1866 + }, + { + "epoch": 1.4912140575079873, + "grad_norm": 0.19793611764907837, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 1867 + }, + { + "epoch": 1.4920127795527156, + "grad_norm": 0.18775872886180878, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1868 + }, + { + "epoch": 1.4928115015974441, + "grad_norm": 0.07068412005901337, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1869 + }, + { + "epoch": 1.4936102236421724, + "grad_norm": 0.07640416920185089, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 1870 + }, + { + "epoch": 1.494408945686901, + "grad_norm": 0.1333264708518982, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 1871 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 0.13000380992889404, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1872 + }, + { + "epoch": 1.4960063897763578, + "grad_norm": 0.05382491648197174, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 1873 + }, + { + "epoch": 1.4968051118210863, + "grad_norm": 0.12773285806179047, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1874 + }, + { + "epoch": 1.4976038338658146, + "grad_norm": 0.2441176027059555, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1875 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 0.26628851890563965, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 1876 + }, + { + "epoch": 1.4992012779552715, + "grad_norm": 0.1295953392982483, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 1877 + }, + { + "epoch": 1.5, + "grad_norm": 0.10860511660575867, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 1878 + }, + { + "epoch": 1.5007987220447285, + "grad_norm": 0.25177180767059326, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1879 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 0.2379150688648224, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 1880 + }, + { + "epoch": 1.5023961661341851, + "grad_norm": 0.101965993642807, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 1881 + }, + { + "epoch": 1.5031948881789137, + "grad_norm": 0.15633052587509155, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 1882 + }, + { + "epoch": 1.5039936102236422, + "grad_norm": 0.3071416914463043, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1883 + }, + { + "epoch": 1.5047923322683707, + "grad_norm": 0.2126736044883728, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 1884 + }, + { + "epoch": 1.505591054313099, + "grad_norm": 0.05252298340201378, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1885 + }, + { + "epoch": 1.5063897763578273, + "grad_norm": 0.23854316771030426, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1886 + }, + { + "epoch": 1.5071884984025559, + "grad_norm": 0.305148720741272, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1887 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 0.1371227502822876, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 1888 + }, + { + "epoch": 1.508785942492013, + "grad_norm": 0.16433516144752502, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1889 + }, + { + "epoch": 1.5095846645367412, + "grad_norm": 0.24010877311229706, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1890 + }, + { + "epoch": 1.5103833865814695, + "grad_norm": 0.12839943170547485, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1891 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 0.055945366621017456, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 1892 + }, + { + "epoch": 1.5119808306709266, + "grad_norm": 0.16645023226737976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 1893 + }, + { + "epoch": 1.5127795527156551, + "grad_norm": 0.14626996219158173, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 1894 + }, + { + "epoch": 1.5135782747603834, + "grad_norm": 0.04274629056453705, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 1895 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 0.10497253388166428, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 1896 + }, + { + "epoch": 1.5151757188498403, + "grad_norm": 0.159364715218544, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 1897 + }, + { + "epoch": 1.5159744408945688, + "grad_norm": 0.11409968137741089, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1898 + }, + { + "epoch": 1.516773162939297, + "grad_norm": 0.03989424183964729, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1899 + }, + { + "epoch": 1.5175718849840254, + "grad_norm": 0.12703374028205872, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1900 + }, + { + "epoch": 1.518370607028754, + "grad_norm": 0.20534875988960266, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 1901 + }, + { + "epoch": 1.5191693290734825, + "grad_norm": 0.2276938110589981, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1902 + }, + { + "epoch": 1.519968051118211, + "grad_norm": 0.114278644323349, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1903 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 0.08295118063688278, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 1904 + }, + { + "epoch": 1.5215654952076676, + "grad_norm": 0.18610796332359314, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1905 + }, + { + "epoch": 1.5223642172523961, + "grad_norm": 0.1920524388551712, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 1906 + }, + { + "epoch": 1.5231629392971247, + "grad_norm": 0.06447675824165344, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1907 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 0.17821159958839417, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 1908 + }, + { + "epoch": 1.5247603833865815, + "grad_norm": 0.23894363641738892, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1909 + }, + { + "epoch": 1.5255591054313098, + "grad_norm": 0.14711391925811768, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1910 + }, + { + "epoch": 1.5263578274760383, + "grad_norm": 0.07863837480545044, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 1911 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 0.20990678668022156, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 1912 + }, + { + "epoch": 1.5279552715654952, + "grad_norm": 0.19979886710643768, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1913 + }, + { + "epoch": 1.5287539936102237, + "grad_norm": 0.0871618464589119, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 1914 + }, + { + "epoch": 1.529552715654952, + "grad_norm": 0.09294576942920685, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 1915 + }, + { + "epoch": 1.5303514376996805, + "grad_norm": 0.23010258376598358, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 1916 + }, + { + "epoch": 1.531150159744409, + "grad_norm": 0.2919708788394928, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 1917 + }, + { + "epoch": 1.5319488817891374, + "grad_norm": 0.21767428517341614, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 1918 + }, + { + "epoch": 1.5327476038338657, + "grad_norm": 0.07844182848930359, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 1919 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 0.14891114830970764, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 1920 + }, + { + "epoch": 1.5343450479233227, + "grad_norm": 0.17959977686405182, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 1921 + }, + { + "epoch": 1.5351437699680512, + "grad_norm": 0.10217028856277466, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 1922 + }, + { + "epoch": 1.5359424920127795, + "grad_norm": 0.08135818690061569, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 1923 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 0.19660547375679016, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1924 + }, + { + "epoch": 1.5375399361022364, + "grad_norm": 0.2106354534626007, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1925 + }, + { + "epoch": 1.538338658146965, + "grad_norm": 0.11042182147502899, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1926 + }, + { + "epoch": 1.5391373801916934, + "grad_norm": 0.08777181059122086, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1927 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 0.18283812701702118, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 1928 + }, + { + "epoch": 1.54073482428115, + "grad_norm": 0.11731691658496857, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 1929 + }, + { + "epoch": 1.5415335463258786, + "grad_norm": 0.04163304716348648, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 1930 + }, + { + "epoch": 1.542332268370607, + "grad_norm": 0.12119868397712708, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 1931 + }, + { + "epoch": 1.5431309904153354, + "grad_norm": 0.18475785851478577, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1932 + }, + { + "epoch": 1.543929712460064, + "grad_norm": 0.16582897305488586, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 1933 + }, + { + "epoch": 1.5447284345047922, + "grad_norm": 0.086383156478405, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1934 + }, + { + "epoch": 1.5455271565495208, + "grad_norm": 0.047143738716840744, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 1935 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 0.0830119326710701, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1936 + }, + { + "epoch": 1.5471246006389776, + "grad_norm": 0.14226214587688446, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 1937 + }, + { + "epoch": 1.547923322683706, + "grad_norm": 0.1719929724931717, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1938 + }, + { + "epoch": 1.5487220447284344, + "grad_norm": 0.18388192355632782, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 1939 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 0.16870245337486267, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 1940 + }, + { + "epoch": 1.5503194888178915, + "grad_norm": 0.1100412905216217, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 1941 + }, + { + "epoch": 1.5511182108626198, + "grad_norm": 0.05124165490269661, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1942 + }, + { + "epoch": 1.5519169329073481, + "grad_norm": 0.08937443792819977, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 1943 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 0.13589949905872345, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 1944 + }, + { + "epoch": 1.5535143769968052, + "grad_norm": 0.12346407026052475, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1945 + }, + { + "epoch": 1.5543130990415337, + "grad_norm": 0.11836438626050949, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 1946 + }, + { + "epoch": 1.555111821086262, + "grad_norm": 0.07569031417369843, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 1947 + }, + { + "epoch": 1.5559105431309903, + "grad_norm": 0.039178211241960526, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1948 + }, + { + "epoch": 1.5567092651757188, + "grad_norm": 0.0431843139231205, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1949 + }, + { + "epoch": 1.5575079872204474, + "grad_norm": 0.06331207603216171, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 1950 + }, + { + "epoch": 1.5583067092651757, + "grad_norm": 0.0670275092124939, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 1951 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 0.04372883588075638, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 1952 + }, + { + "epoch": 1.5599041533546325, + "grad_norm": 0.15768256783485413, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 1953 + }, + { + "epoch": 1.560702875399361, + "grad_norm": 0.30828192830085754, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 1954 + }, + { + "epoch": 1.5615015974440896, + "grad_norm": 0.3741140365600586, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 1955 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 0.25689223408699036, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1956 + }, + { + "epoch": 1.5630990415335462, + "grad_norm": 0.0691552683711052, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1957 + }, + { + "epoch": 1.5638977635782747, + "grad_norm": 0.2742094099521637, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 1958 + }, + { + "epoch": 1.5646964856230032, + "grad_norm": 0.2760325074195862, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 1959 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 0.09094057232141495, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 1960 + }, + { + "epoch": 1.56629392971246, + "grad_norm": 0.11926092952489853, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1961 + }, + { + "epoch": 1.5670926517571884, + "grad_norm": 0.18398839235305786, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 1962 + }, + { + "epoch": 1.567891373801917, + "grad_norm": 0.17090962827205658, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 1963 + }, + { + "epoch": 1.5686900958466454, + "grad_norm": 0.07806222885847092, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 1964 + }, + { + "epoch": 1.569488817891374, + "grad_norm": 0.17260140180587769, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 1965 + }, + { + "epoch": 1.5702875399361023, + "grad_norm": 0.2848401665687561, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1966 + }, + { + "epoch": 1.5710862619808306, + "grad_norm": 0.19075879454612732, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1967 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 0.044234778732061386, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 1968 + }, + { + "epoch": 1.5726837060702876, + "grad_norm": 0.16188788414001465, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 1969 + }, + { + "epoch": 1.573482428115016, + "grad_norm": 0.19148766994476318, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 1970 + }, + { + "epoch": 1.5742811501597445, + "grad_norm": 0.11576604843139648, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 1971 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 0.049716517329216, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 1972 + }, + { + "epoch": 1.5758785942492013, + "grad_norm": 0.12528614699840546, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 1973 + }, + { + "epoch": 1.5766773162939298, + "grad_norm": 0.1574268341064453, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 1974 + }, + { + "epoch": 1.5774760383386581, + "grad_norm": 0.06606525182723999, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 1975 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 0.16142094135284424, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1976 + }, + { + "epoch": 1.579073482428115, + "grad_norm": 0.29769718647003174, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 1977 + }, + { + "epoch": 1.5798722044728435, + "grad_norm": 0.20111548900604248, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 1978 + }, + { + "epoch": 1.580670926517572, + "grad_norm": 0.06375493854284286, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 1979 + }, + { + "epoch": 1.5814696485623003, + "grad_norm": 0.2208068072795868, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 1980 + }, + { + "epoch": 1.5822683706070286, + "grad_norm": 0.2920839488506317, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 1981 + }, + { + "epoch": 1.5830670926517572, + "grad_norm": 0.2115958034992218, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 1982 + }, + { + "epoch": 1.5838658146964857, + "grad_norm": 0.048249468207359314, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 1983 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 0.15551301836967468, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 1984 + }, + { + "epoch": 1.5854632587859425, + "grad_norm": 0.2190883755683899, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 1985 + }, + { + "epoch": 1.5862619808306708, + "grad_norm": 0.15155111253261566, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 1986 + }, + { + "epoch": 1.5870607028753994, + "grad_norm": 0.056616391986608505, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 1987 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 0.1638905555009842, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 1988 + }, + { + "epoch": 1.5886581469648562, + "grad_norm": 0.11643283069133759, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 1989 + }, + { + "epoch": 1.5894568690095847, + "grad_norm": 0.06423045694828033, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 1990 + }, + { + "epoch": 1.590255591054313, + "grad_norm": 0.11044095456600189, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 1991 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 0.11911707371473312, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 1992 + }, + { + "epoch": 1.59185303514377, + "grad_norm": 0.045604925602674484, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 1993 + }, + { + "epoch": 1.5926517571884984, + "grad_norm": 0.10280558466911316, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 1994 + }, + { + "epoch": 1.5934504792332267, + "grad_norm": 0.13807371258735657, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 1995 + }, + { + "epoch": 1.5942492012779552, + "grad_norm": 0.06163270026445389, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 1996 + }, + { + "epoch": 1.5950479233226837, + "grad_norm": 0.12899963557720184, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 1997 + }, + { + "epoch": 1.5958466453674123, + "grad_norm": 0.24358411133289337, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 1998 + }, + { + "epoch": 1.5966453674121406, + "grad_norm": 0.23341934382915497, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 1999 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 0.11766334623098373, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2000 + }, + { + "epoch": 1.5982428115015974, + "grad_norm": 0.07918071746826172, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2001 + }, + { + "epoch": 1.599041533546326, + "grad_norm": 0.1473437398672104, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2002 + }, + { + "epoch": 1.5998402555910545, + "grad_norm": 0.08945708721876144, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2003 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.06553255021572113, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2004 + }, + { + "epoch": 1.601437699680511, + "grad_norm": 0.12708786129951477, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2005 + }, + { + "epoch": 1.6022364217252396, + "grad_norm": 0.16935905814170837, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2006 + }, + { + "epoch": 1.6030351437699681, + "grad_norm": 0.10428016632795334, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2007 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 0.06016766279935837, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 2008 + }, + { + "epoch": 1.604632587859425, + "grad_norm": 0.1563751995563507, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2009 + }, + { + "epoch": 1.6054313099041533, + "grad_norm": 0.1919829398393631, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2010 + }, + { + "epoch": 1.6062300319488818, + "grad_norm": 0.14739179611206055, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2011 + }, + { + "epoch": 1.6070287539936103, + "grad_norm": 0.08086550235748291, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2012 + }, + { + "epoch": 1.6078274760383386, + "grad_norm": 0.06594815105199814, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2013 + }, + { + "epoch": 1.608626198083067, + "grad_norm": 0.10502789169549942, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2014 + }, + { + "epoch": 1.6094249201277955, + "grad_norm": 0.1312190145254135, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2015 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 0.062411367893218994, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2016 + }, + { + "epoch": 1.6110223642172525, + "grad_norm": 0.04986036196351051, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2017 + }, + { + "epoch": 1.6118210862619808, + "grad_norm": 0.08428573608398438, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2018 + }, + { + "epoch": 1.6126198083067091, + "grad_norm": 0.11552372574806213, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2019 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 0.07657046616077423, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2020 + }, + { + "epoch": 1.6142172523961662, + "grad_norm": 0.05540962517261505, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 2021 + }, + { + "epoch": 1.6150159744408947, + "grad_norm": 0.048573557287454605, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2022 + }, + { + "epoch": 1.615814696485623, + "grad_norm": 0.08630840480327606, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2023 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 0.06090754270553589, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2024 + }, + { + "epoch": 1.6174121405750799, + "grad_norm": 0.05828041955828667, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2025 + }, + { + "epoch": 1.6182108626198084, + "grad_norm": 0.12483426928520203, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2026 + }, + { + "epoch": 1.6190095846645367, + "grad_norm": 0.13772840797901154, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2027 + }, + { + "epoch": 1.619808306709265, + "grad_norm": 0.08477568626403809, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2028 + }, + { + "epoch": 1.6206070287539935, + "grad_norm": 0.037577688694000244, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2029 + }, + { + "epoch": 1.621405750798722, + "grad_norm": 0.07961893081665039, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2030 + }, + { + "epoch": 1.6222044728434506, + "grad_norm": 0.06744182854890823, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2031 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 0.06228869408369064, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2032 + }, + { + "epoch": 1.6238019169329072, + "grad_norm": 0.1972920298576355, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2033 + }, + { + "epoch": 1.6246006389776357, + "grad_norm": 0.2701529562473297, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2034 + }, + { + "epoch": 1.6253993610223643, + "grad_norm": 0.20371970534324646, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2035 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 0.08887646347284317, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2036 + }, + { + "epoch": 1.626996805111821, + "grad_norm": 0.06480003893375397, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2037 + }, + { + "epoch": 1.6277955271565494, + "grad_norm": 0.089780792593956, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2038 + }, + { + "epoch": 1.628594249201278, + "grad_norm": 0.04014933854341507, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2039 + }, + { + "epoch": 1.6293929712460065, + "grad_norm": 0.0993470847606659, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 2040 + }, + { + "epoch": 1.630191693290735, + "grad_norm": 0.1957429200410843, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2041 + }, + { + "epoch": 1.6309904153354633, + "grad_norm": 0.2273249477148056, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2042 + }, + { + "epoch": 1.6317891373801916, + "grad_norm": 0.1936638057231903, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2043 + }, + { + "epoch": 1.6325878594249201, + "grad_norm": 0.10150687396526337, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2044 + }, + { + "epoch": 1.6333865814696487, + "grad_norm": 0.051224563270807266, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2045 + }, + { + "epoch": 1.634185303514377, + "grad_norm": 0.13044138252735138, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2046 + }, + { + "epoch": 1.6349840255591053, + "grad_norm": 0.16140064597129822, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2047 + }, + { + "epoch": 1.6357827476038338, + "grad_norm": 0.13187173008918762, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2048 + }, + { + "epoch": 1.6365814696485623, + "grad_norm": 0.03873397782444954, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2049 + }, + { + "epoch": 1.6373801916932909, + "grad_norm": 0.0575883649289608, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2050 + }, + { + "epoch": 1.6381789137380192, + "grad_norm": 0.039476748555898666, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 2051 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 0.06802869588136673, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2052 + }, + { + "epoch": 1.639776357827476, + "grad_norm": 0.059946198016405106, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2053 + }, + { + "epoch": 1.6405750798722045, + "grad_norm": 0.05185665935277939, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2054 + }, + { + "epoch": 1.641373801916933, + "grad_norm": 0.08230192214250565, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2055 + }, + { + "epoch": 1.6421725239616614, + "grad_norm": 0.10175196081399918, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2056 + }, + { + "epoch": 1.6429712460063897, + "grad_norm": 0.07616171985864639, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2057 + }, + { + "epoch": 1.6437699680511182, + "grad_norm": 0.4597811698913574, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2058 + }, + { + "epoch": 1.6445686900958467, + "grad_norm": 0.12450811266899109, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2059 + }, + { + "epoch": 1.645367412140575, + "grad_norm": 0.10847678035497665, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2060 + }, + { + "epoch": 1.6461661341853036, + "grad_norm": 0.05778864026069641, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2061 + }, + { + "epoch": 1.6469648562300319, + "grad_norm": 0.04321129992604256, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2062 + }, + { + "epoch": 1.6477635782747604, + "grad_norm": 0.05467045307159424, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2063 + }, + { + "epoch": 1.648562300319489, + "grad_norm": 0.044298864901065826, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2064 + }, + { + "epoch": 1.6493610223642172, + "grad_norm": 0.03863062337040901, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2065 + }, + { + "epoch": 1.6501597444089455, + "grad_norm": 0.04040979593992233, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2066 + }, + { + "epoch": 1.650958466453674, + "grad_norm": 0.03647322207689285, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2067 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 0.049459293484687805, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2068 + }, + { + "epoch": 1.6525559105431311, + "grad_norm": 0.052851296961307526, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2069 + }, + { + "epoch": 1.6533546325878594, + "grad_norm": 0.10360822081565857, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2070 + }, + { + "epoch": 1.6541533546325877, + "grad_norm": 0.18817105889320374, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2071 + }, + { + "epoch": 1.6549520766773163, + "grad_norm": 0.1711605340242386, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2072 + }, + { + "epoch": 1.6557507987220448, + "grad_norm": 0.08807278424501419, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2073 + }, + { + "epoch": 1.6565495207667733, + "grad_norm": 0.0631125420331955, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2074 + }, + { + "epoch": 1.6573482428115016, + "grad_norm": 0.17277394235134125, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2075 + }, + { + "epoch": 1.65814696485623, + "grad_norm": 0.2353454977273941, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2076 + }, + { + "epoch": 1.6589456869009584, + "grad_norm": 0.18835891783237457, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2077 + }, + { + "epoch": 1.659744408945687, + "grad_norm": 0.08717352151870728, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2078 + }, + { + "epoch": 1.6605431309904153, + "grad_norm": 0.05640486627817154, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2079 + }, + { + "epoch": 1.6613418530351438, + "grad_norm": 0.11206189543008804, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2080 + }, + { + "epoch": 1.6621405750798721, + "grad_norm": 0.10098055750131607, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2081 + }, + { + "epoch": 1.6629392971246006, + "grad_norm": 0.04627184569835663, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2082 + }, + { + "epoch": 1.6637380191693292, + "grad_norm": 0.13048212230205536, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2083 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 0.22329512238502502, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2084 + }, + { + "epoch": 1.6653354632587858, + "grad_norm": 0.23544666171073914, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2085 + }, + { + "epoch": 1.6661341853035143, + "grad_norm": 0.1329459846019745, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2086 + }, + { + "epoch": 1.6669329073482428, + "grad_norm": 0.07398947328329086, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2087 + }, + { + "epoch": 1.6677316293929714, + "grad_norm": 0.1926809549331665, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2088 + }, + { + "epoch": 1.6685303514376997, + "grad_norm": 0.19097647070884705, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 2089 + }, + { + "epoch": 1.669329073482428, + "grad_norm": 0.10474745184183121, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2090 + }, + { + "epoch": 1.6701277955271565, + "grad_norm": 0.04437112435698509, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2091 + }, + { + "epoch": 1.670926517571885, + "grad_norm": 0.13698135316371918, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2092 + }, + { + "epoch": 1.6717252396166136, + "grad_norm": 0.14437462389469147, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2093 + }, + { + "epoch": 1.6725239616613419, + "grad_norm": 0.0938732922077179, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2094 + }, + { + "epoch": 1.6733226837060702, + "grad_norm": 0.060729511082172394, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2095 + }, + { + "epoch": 1.6741214057507987, + "grad_norm": 0.05354619398713112, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2096 + }, + { + "epoch": 1.6749201277955272, + "grad_norm": 0.056909799575805664, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2097 + }, + { + "epoch": 1.6757188498402555, + "grad_norm": 0.09815286099910736, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2098 + }, + { + "epoch": 1.676517571884984, + "grad_norm": 0.1432102620601654, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2099 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 0.14039601385593414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2100 + }, + { + "epoch": 1.678115015974441, + "grad_norm": 0.06634008139371872, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2101 + }, + { + "epoch": 1.6789137380191694, + "grad_norm": 0.1347021609544754, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2102 + }, + { + "epoch": 1.6797124600638977, + "grad_norm": 0.24721868336200714, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2103 + }, + { + "epoch": 1.680511182108626, + "grad_norm": 0.23194770514965057, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2104 + }, + { + "epoch": 1.6813099041533546, + "grad_norm": 0.12276436388492584, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2105 + }, + { + "epoch": 1.682108626198083, + "grad_norm": 0.06224825233221054, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2106 + }, + { + "epoch": 1.6829073482428116, + "grad_norm": 0.20683766901493073, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2107 + }, + { + "epoch": 1.68370607028754, + "grad_norm": 0.26914462447166443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2108 + }, + { + "epoch": 1.6845047923322682, + "grad_norm": 0.20070654153823853, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2109 + }, + { + "epoch": 1.6853035143769968, + "grad_norm": 0.08465532958507538, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2110 + }, + { + "epoch": 1.6861022364217253, + "grad_norm": 0.10843367129564285, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2111 + }, + { + "epoch": 1.6869009584664538, + "grad_norm": 0.20252646505832672, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2112 + }, + { + "epoch": 1.6876996805111821, + "grad_norm": 0.11803672462701797, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2113 + }, + { + "epoch": 1.6884984025559104, + "grad_norm": 0.08800901472568512, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2114 + }, + { + "epoch": 1.689297124600639, + "grad_norm": 0.23917800188064575, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2115 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 0.21528035402297974, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2116 + }, + { + "epoch": 1.6908945686900958, + "grad_norm": 0.05292942747473717, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2117 + }, + { + "epoch": 1.6916932907348243, + "grad_norm": 0.12942583858966827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2118 + }, + { + "epoch": 1.6924920127795526, + "grad_norm": 0.19304881989955902, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2119 + }, + { + "epoch": 1.6932907348242812, + "grad_norm": 0.10951094329357147, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 2120 + }, + { + "epoch": 1.6940894568690097, + "grad_norm": 0.07684643566608429, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2121 + }, + { + "epoch": 1.694888178913738, + "grad_norm": 0.14990608394145966, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2122 + }, + { + "epoch": 1.6956869009584663, + "grad_norm": 0.1104716882109642, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2123 + }, + { + "epoch": 1.6964856230031948, + "grad_norm": 0.06538088619709015, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2124 + }, + { + "epoch": 1.6972843450479234, + "grad_norm": 0.05474448576569557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2125 + }, + { + "epoch": 1.6980830670926519, + "grad_norm": 0.0803864449262619, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2126 + }, + { + "epoch": 1.6988817891373802, + "grad_norm": 0.04384651407599449, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2127 + }, + { + "epoch": 1.6996805111821085, + "grad_norm": 0.07006746530532837, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 2128 + }, + { + "epoch": 1.700479233226837, + "grad_norm": 0.08840122073888779, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2129 + }, + { + "epoch": 1.7012779552715656, + "grad_norm": 0.06421404331922531, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2130 + }, + { + "epoch": 1.702076677316294, + "grad_norm": 0.03711751103401184, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2131 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 0.06725160032510757, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2132 + }, + { + "epoch": 1.7036741214057507, + "grad_norm": 0.0517839640378952, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2133 + }, + { + "epoch": 1.7044728434504792, + "grad_norm": 0.046399205923080444, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2134 + }, + { + "epoch": 1.7052715654952078, + "grad_norm": 0.05188435688614845, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2135 + }, + { + "epoch": 1.706070287539936, + "grad_norm": 0.08578629791736603, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2136 + }, + { + "epoch": 1.7068690095846646, + "grad_norm": 0.07895999401807785, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2137 + }, + { + "epoch": 1.707667731629393, + "grad_norm": 0.060662928968667984, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2138 + }, + { + "epoch": 1.7084664536741214, + "grad_norm": 0.08372191339731216, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2139 + }, + { + "epoch": 1.70926517571885, + "grad_norm": 0.1217966303229332, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2140 + }, + { + "epoch": 1.7100638977635783, + "grad_norm": 0.14054186642169952, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2141 + }, + { + "epoch": 1.7108626198083066, + "grad_norm": 0.11693520098924637, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2142 + }, + { + "epoch": 1.711661341853035, + "grad_norm": 0.04271163418889046, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2143 + }, + { + "epoch": 1.7124600638977636, + "grad_norm": 0.11898874491453171, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2144 + }, + { + "epoch": 1.7132587859424921, + "grad_norm": 0.2637499272823334, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2145 + }, + { + "epoch": 1.7140575079872205, + "grad_norm": 0.29218390583992004, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2146 + }, + { + "epoch": 1.7148562300319488, + "grad_norm": 0.1899375170469284, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2147 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 0.04336607828736305, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2148 + }, + { + "epoch": 1.7164536741214058, + "grad_norm": 0.14123578369617462, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2149 + }, + { + "epoch": 1.7172523961661343, + "grad_norm": 0.19930055737495422, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2150 + }, + { + "epoch": 1.7180511182108626, + "grad_norm": 0.1796298772096634, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2151 + }, + { + "epoch": 1.718849840255591, + "grad_norm": 0.07607068121433258, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2152 + }, + { + "epoch": 1.7196485623003195, + "grad_norm": 0.12980210781097412, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2153 + }, + { + "epoch": 1.720447284345048, + "grad_norm": 0.2507205009460449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2154 + }, + { + "epoch": 1.7212460063897763, + "grad_norm": 0.2388920783996582, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2155 + }, + { + "epoch": 1.7220447284345048, + "grad_norm": 0.13363847136497498, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2156 + }, + { + "epoch": 1.7228434504792332, + "grad_norm": 0.048030026257038116, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 2157 + }, + { + "epoch": 1.7236421725239617, + "grad_norm": 0.14619708061218262, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2158 + }, + { + "epoch": 1.7244408945686902, + "grad_norm": 0.22031216323375702, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2159 + }, + { + "epoch": 1.7252396166134185, + "grad_norm": 0.18440701067447662, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2160 + }, + { + "epoch": 1.7260383386581468, + "grad_norm": 0.08183866739273071, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2161 + }, + { + "epoch": 1.7268370607028753, + "grad_norm": 0.05314984545111656, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2162 + }, + { + "epoch": 1.7276357827476039, + "grad_norm": 0.1438753753900528, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2163 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 0.0881122425198555, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 2164 + }, + { + "epoch": 1.7292332268370607, + "grad_norm": 0.1165589690208435, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2165 + }, + { + "epoch": 1.730031948881789, + "grad_norm": 0.14884884655475616, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2166 + }, + { + "epoch": 1.7308306709265175, + "grad_norm": 0.10219287127256393, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2167 + }, + { + "epoch": 1.731629392971246, + "grad_norm": 0.059794824570417404, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2168 + }, + { + "epoch": 1.7324281150159746, + "grad_norm": 0.0538945347070694, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2169 + }, + { + "epoch": 1.733226837060703, + "grad_norm": 0.1016303226351738, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2170 + }, + { + "epoch": 1.7340255591054312, + "grad_norm": 0.058912694454193115, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2171 + }, + { + "epoch": 1.7348242811501597, + "grad_norm": 0.060018621385097504, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2172 + }, + { + "epoch": 1.7356230031948883, + "grad_norm": 0.05386706069111824, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2173 + }, + { + "epoch": 1.7364217252396166, + "grad_norm": 0.06266453117132187, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2174 + }, + { + "epoch": 1.7372204472843449, + "grad_norm": 0.1035243570804596, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 2175 + }, + { + "epoch": 1.7380191693290734, + "grad_norm": 0.17216888070106506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2176 + }, + { + "epoch": 1.738817891373802, + "grad_norm": 0.23428532481193542, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2177 + }, + { + "epoch": 1.7396166134185305, + "grad_norm": 0.21038073301315308, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2178 + }, + { + "epoch": 1.7404153354632588, + "grad_norm": 0.1487000286579132, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2179 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 0.03916196525096893, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2180 + }, + { + "epoch": 1.7420127795527156, + "grad_norm": 0.13702991604804993, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2181 + }, + { + "epoch": 1.7428115015974441, + "grad_norm": 0.21363528072834015, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2182 + }, + { + "epoch": 1.7436102236421727, + "grad_norm": 0.134271502494812, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2183 + }, + { + "epoch": 1.744408945686901, + "grad_norm": 0.062452565878629684, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2184 + }, + { + "epoch": 1.7452076677316293, + "grad_norm": 0.1745995730161667, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2185 + }, + { + "epoch": 1.7460063897763578, + "grad_norm": 0.19709894061088562, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2186 + }, + { + "epoch": 1.7468051118210863, + "grad_norm": 0.1201571598649025, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2187 + }, + { + "epoch": 1.7476038338658149, + "grad_norm": 0.03690087050199509, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2188 + }, + { + "epoch": 1.7484025559105432, + "grad_norm": 0.1387440711259842, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2189 + }, + { + "epoch": 1.7492012779552715, + "grad_norm": 0.2084781676530838, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2190 + }, + { + "epoch": 1.75, + "grad_norm": 0.17941167950630188, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2191 + }, + { + "epoch": 1.7507987220447285, + "grad_norm": 0.09751889854669571, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2192 + }, + { + "epoch": 1.7515974440894568, + "grad_norm": 0.04116421565413475, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2193 + }, + { + "epoch": 1.7523961661341851, + "grad_norm": 0.14683429896831512, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2194 + }, + { + "epoch": 1.7531948881789137, + "grad_norm": 0.19602352380752563, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2195 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 0.18503598868846893, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2196 + }, + { + "epoch": 1.7547923322683707, + "grad_norm": 0.09473808109760284, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2197 + }, + { + "epoch": 1.755591054313099, + "grad_norm": 0.05645129457116127, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2198 + }, + { + "epoch": 1.7563897763578273, + "grad_norm": 0.09260818362236023, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2199 + }, + { + "epoch": 1.7571884984025559, + "grad_norm": 0.045891985297203064, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2200 + }, + { + "epoch": 1.7579872204472844, + "grad_norm": 0.125623419880867, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2201 + }, + { + "epoch": 1.758785942492013, + "grad_norm": 0.18919512629508972, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2202 + }, + { + "epoch": 1.7595846645367412, + "grad_norm": 0.17549264430999756, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2203 + }, + { + "epoch": 1.7603833865814695, + "grad_norm": 0.047342319041490555, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 2204 + }, + { + "epoch": 1.761182108626198, + "grad_norm": 0.177268847823143, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2205 + }, + { + "epoch": 1.7619808306709266, + "grad_norm": 0.28258222341537476, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2206 + }, + { + "epoch": 1.7627795527156551, + "grad_norm": 0.25111353397369385, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2207 + }, + { + "epoch": 1.7635782747603834, + "grad_norm": 0.11864849925041199, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2208 + }, + { + "epoch": 1.7643769968051117, + "grad_norm": 0.06387785822153091, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2209 + }, + { + "epoch": 1.7651757188498403, + "grad_norm": 0.1264238804578781, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2210 + }, + { + "epoch": 1.7659744408945688, + "grad_norm": 0.12080882489681244, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2211 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.05618004873394966, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2212 + }, + { + "epoch": 1.7675718849840254, + "grad_norm": 0.06543037295341492, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2213 + }, + { + "epoch": 1.768370607028754, + "grad_norm": 0.08525256812572479, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2214 + }, + { + "epoch": 1.7691693290734825, + "grad_norm": 0.08571972697973251, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2215 + }, + { + "epoch": 1.769968051118211, + "grad_norm": 0.04897582530975342, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2216 + }, + { + "epoch": 1.7707667731629393, + "grad_norm": 0.07296427339315414, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2217 + }, + { + "epoch": 1.7715654952076676, + "grad_norm": 0.041904110461473465, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2218 + }, + { + "epoch": 1.7723642172523961, + "grad_norm": 0.053191233426332474, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2219 + }, + { + "epoch": 1.7731629392971247, + "grad_norm": 0.056369587779045105, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2220 + }, + { + "epoch": 1.7739616613418532, + "grad_norm": 0.06455157697200775, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2221 + }, + { + "epoch": 1.7747603833865815, + "grad_norm": 0.06467561423778534, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2222 + }, + { + "epoch": 1.7755591054313098, + "grad_norm": 0.07162238657474518, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2223 + }, + { + "epoch": 1.7763578274760383, + "grad_norm": 0.045193906873464584, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2224 + }, + { + "epoch": 1.7771565495207668, + "grad_norm": 0.07172992080450058, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2225 + }, + { + "epoch": 1.7779552715654952, + "grad_norm": 0.07163143157958984, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2226 + }, + { + "epoch": 1.7787539936102237, + "grad_norm": 0.11480346322059631, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2227 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 0.21525998413562775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 2228 + }, + { + "epoch": 1.7803514376996805, + "grad_norm": 0.20769886672496796, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2229 + }, + { + "epoch": 1.781150159744409, + "grad_norm": 0.13149204850196838, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2230 + }, + { + "epoch": 1.7819488817891374, + "grad_norm": 0.06223989278078079, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2231 + }, + { + "epoch": 1.7827476038338657, + "grad_norm": 0.11386150866746902, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2232 + }, + { + "epoch": 1.7835463258785942, + "grad_norm": 0.1448865532875061, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2233 + }, + { + "epoch": 1.7843450479233227, + "grad_norm": 0.11244893074035645, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2234 + }, + { + "epoch": 1.7851437699680512, + "grad_norm": 0.06307587027549744, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2235 + }, + { + "epoch": 1.7859424920127795, + "grad_norm": 0.1529018133878708, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2236 + }, + { + "epoch": 1.7867412140575079, + "grad_norm": 0.212649405002594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2237 + }, + { + "epoch": 1.7875399361022364, + "grad_norm": 0.18361856043338776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2238 + }, + { + "epoch": 1.788338658146965, + "grad_norm": 0.06960433721542358, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2239 + }, + { + "epoch": 1.7891373801916934, + "grad_norm": 0.13445821404457092, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2240 + }, + { + "epoch": 1.7899361022364217, + "grad_norm": 0.24758578836917877, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 2241 + }, + { + "epoch": 1.79073482428115, + "grad_norm": 0.27208608388900757, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2242 + }, + { + "epoch": 1.7915335463258786, + "grad_norm": 0.1256505697965622, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2243 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 0.12209334224462509, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2244 + }, + { + "epoch": 1.7931309904153354, + "grad_norm": 0.2690032720565796, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2245 + }, + { + "epoch": 1.793929712460064, + "grad_norm": 0.27393221855163574, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2246 + }, + { + "epoch": 1.7947284345047922, + "grad_norm": 0.12508991360664368, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 2247 + }, + { + "epoch": 1.7955271565495208, + "grad_norm": 0.10001108795404434, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2248 + }, + { + "epoch": 1.7963258785942493, + "grad_norm": 0.2588697373867035, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2249 + }, + { + "epoch": 1.7971246006389776, + "grad_norm": 0.24723860621452332, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2250 + }, + { + "epoch": 1.797923322683706, + "grad_norm": 0.09018664062023163, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2251 + }, + { + "epoch": 1.7987220447284344, + "grad_norm": 0.09745316952466965, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 2252 + }, + { + "epoch": 1.799520766773163, + "grad_norm": 0.20877481997013092, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 2253 + }, + { + "epoch": 1.8003194888178915, + "grad_norm": 0.24291004240512848, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2254 + }, + { + "epoch": 1.8011182108626198, + "grad_norm": 0.1967754364013672, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2255 + }, + { + "epoch": 1.8019169329073481, + "grad_norm": 0.088215172290802, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2256 + }, + { + "epoch": 1.8027156549520766, + "grad_norm": 0.07018816471099854, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2257 + }, + { + "epoch": 1.8035143769968052, + "grad_norm": 0.17161858081817627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2258 + }, + { + "epoch": 1.8043130990415337, + "grad_norm": 0.22007174789905548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2259 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 0.16093726456165314, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2260 + }, + { + "epoch": 1.8059105431309903, + "grad_norm": 0.06763539463281631, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2261 + }, + { + "epoch": 1.8067092651757188, + "grad_norm": 0.1066257432103157, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2262 + }, + { + "epoch": 1.8075079872204474, + "grad_norm": 0.17658250033855438, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2263 + }, + { + "epoch": 1.8083067092651757, + "grad_norm": 0.21157506108283997, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2264 + }, + { + "epoch": 1.8091054313099042, + "grad_norm": 0.16717523336410522, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2265 + }, + { + "epoch": 1.8099041533546325, + "grad_norm": 0.08356527984142303, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2266 + }, + { + "epoch": 1.810702875399361, + "grad_norm": 0.11939100921154022, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2267 + }, + { + "epoch": 1.8115015974440896, + "grad_norm": 0.2322039157152176, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2268 + }, + { + "epoch": 1.8123003194888179, + "grad_norm": 0.2277170568704605, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2269 + }, + { + "epoch": 1.8130990415335462, + "grad_norm": 0.06634530425071716, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2270 + }, + { + "epoch": 1.8138977635782747, + "grad_norm": 0.20808424055576324, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2271 + }, + { + "epoch": 1.8146964856230032, + "grad_norm": 0.3761717975139618, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2272 + }, + { + "epoch": 1.8154952076677318, + "grad_norm": 0.3587193191051483, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2273 + }, + { + "epoch": 1.81629392971246, + "grad_norm": 0.12116564810276031, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2274 + }, + { + "epoch": 1.8170926517571884, + "grad_norm": 0.20137764513492584, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2275 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 0.30456987023353577, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2276 + }, + { + "epoch": 1.8186900958466454, + "grad_norm": 0.15625369548797607, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2277 + }, + { + "epoch": 1.819488817891374, + "grad_norm": 0.12682494521141052, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2278 + }, + { + "epoch": 1.8202875399361023, + "grad_norm": 0.26252153515815735, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2279 + }, + { + "epoch": 1.8210862619808306, + "grad_norm": 0.17610949277877808, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2280 + }, + { + "epoch": 1.821884984025559, + "grad_norm": 0.056205663830041885, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2281 + }, + { + "epoch": 1.8226837060702876, + "grad_norm": 0.1519095003604889, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2282 + }, + { + "epoch": 1.823482428115016, + "grad_norm": 0.1591203212738037, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2283 + }, + { + "epoch": 1.8242811501597445, + "grad_norm": 0.11261039227247238, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2284 + }, + { + "epoch": 1.8250798722044728, + "grad_norm": 0.06855058670043945, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2285 + }, + { + "epoch": 1.8258785942492013, + "grad_norm": 0.04728224128484726, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2286 + }, + { + "epoch": 1.8266773162939298, + "grad_norm": 0.0677042305469513, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2287 + }, + { + "epoch": 1.8274760383386581, + "grad_norm": 0.0836048573255539, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2288 + }, + { + "epoch": 1.8282747603833864, + "grad_norm": 0.0657985508441925, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2289 + }, + { + "epoch": 1.829073482428115, + "grad_norm": 0.05567999184131622, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2290 + }, + { + "epoch": 1.8298722044728435, + "grad_norm": 0.13710817694664001, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2291 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 0.14417411386966705, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2292 + }, + { + "epoch": 1.8314696485623003, + "grad_norm": 0.12273317575454712, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2293 + }, + { + "epoch": 1.8322683706070286, + "grad_norm": 0.12350328266620636, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2294 + }, + { + "epoch": 1.8330670926517572, + "grad_norm": 0.12832887470722198, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2295 + }, + { + "epoch": 1.8338658146964857, + "grad_norm": 0.17759868502616882, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2296 + }, + { + "epoch": 1.8346645367412142, + "grad_norm": 0.18485887348651886, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2297 + }, + { + "epoch": 1.8354632587859425, + "grad_norm": 0.11906488239765167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2298 + }, + { + "epoch": 1.8362619808306708, + "grad_norm": 0.04088319092988968, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2299 + }, + { + "epoch": 1.8370607028753994, + "grad_norm": 0.18988807499408722, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2300 + }, + { + "epoch": 1.8378594249201279, + "grad_norm": 0.2758033275604248, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2301 + }, + { + "epoch": 1.8386581469648562, + "grad_norm": 0.26860401034355164, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2302 + }, + { + "epoch": 1.8394568690095847, + "grad_norm": 0.1770019680261612, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2303 + }, + { + "epoch": 1.840255591054313, + "grad_norm": 0.03740993142127991, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2304 + }, + { + "epoch": 1.8410543130990416, + "grad_norm": 0.13697518408298492, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2305 + }, + { + "epoch": 1.84185303514377, + "grad_norm": 0.15273790061473846, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2306 + }, + { + "epoch": 1.8426517571884984, + "grad_norm": 0.08181154727935791, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2307 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.05599624291062355, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2308 + }, + { + "epoch": 1.8442492012779552, + "grad_norm": 0.17429251968860626, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2309 + }, + { + "epoch": 1.8450479233226837, + "grad_norm": 0.20159491896629333, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2310 + }, + { + "epoch": 1.8458466453674123, + "grad_norm": 0.10825419425964355, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2311 + }, + { + "epoch": 1.8466453674121406, + "grad_norm": 0.0784185528755188, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2312 + }, + { + "epoch": 1.8474440894568689, + "grad_norm": 0.15851987898349762, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2313 + }, + { + "epoch": 1.8482428115015974, + "grad_norm": 0.11244971305131912, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2314 + }, + { + "epoch": 1.849041533546326, + "grad_norm": 0.04119047150015831, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2315 + }, + { + "epoch": 1.8498402555910545, + "grad_norm": 0.12872102856636047, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2316 + }, + { + "epoch": 1.8506389776357828, + "grad_norm": 0.1542259305715561, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2317 + }, + { + "epoch": 1.851437699680511, + "grad_norm": 0.09662868827581406, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2318 + }, + { + "epoch": 1.8522364217252396, + "grad_norm": 0.04452383890748024, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 2319 + }, + { + "epoch": 1.8530351437699681, + "grad_norm": 0.03368959203362465, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2320 + }, + { + "epoch": 1.8538338658146964, + "grad_norm": 0.05867767333984375, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2321 + }, + { + "epoch": 1.854632587859425, + "grad_norm": 0.0774846225976944, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2322 + }, + { + "epoch": 1.8554313099041533, + "grad_norm": 0.05172058939933777, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2323 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 0.06597824394702911, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2324 + }, + { + "epoch": 1.8570287539936103, + "grad_norm": 0.10818778723478317, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2325 + }, + { + "epoch": 1.8578274760383386, + "grad_norm": 0.12698976695537567, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2326 + }, + { + "epoch": 1.858626198083067, + "grad_norm": 0.06547659635543823, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2327 + }, + { + "epoch": 1.8594249201277955, + "grad_norm": 0.08613643050193787, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2328 + }, + { + "epoch": 1.860223642172524, + "grad_norm": 0.23452800512313843, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2329 + }, + { + "epoch": 1.8610223642172525, + "grad_norm": 0.29293227195739746, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2330 + }, + { + "epoch": 1.8618210862619808, + "grad_norm": 0.17590634524822235, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2331 + }, + { + "epoch": 1.8626198083067091, + "grad_norm": 0.09830035269260406, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2332 + }, + { + "epoch": 1.8634185303514377, + "grad_norm": 0.2336016595363617, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2333 + }, + { + "epoch": 1.8642172523961662, + "grad_norm": 0.22990736365318298, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2334 + }, + { + "epoch": 1.8650159744408947, + "grad_norm": 0.14177313446998596, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2335 + }, + { + "epoch": 1.865814696485623, + "grad_norm": 0.07447824627161026, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2336 + }, + { + "epoch": 1.8666134185303513, + "grad_norm": 0.20551882684230804, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2337 + }, + { + "epoch": 1.8674121405750799, + "grad_norm": 0.21193428337574005, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2338 + }, + { + "epoch": 1.8682108626198084, + "grad_norm": 0.09889520704746246, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2339 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 0.06506047397851944, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2340 + }, + { + "epoch": 1.869808306709265, + "grad_norm": 0.10613662004470825, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2341 + }, + { + "epoch": 1.8706070287539935, + "grad_norm": 0.13049691915512085, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2342 + }, + { + "epoch": 1.871405750798722, + "grad_norm": 0.07257628440856934, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2343 + }, + { + "epoch": 1.8722044728434506, + "grad_norm": 0.05402761325240135, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2344 + }, + { + "epoch": 1.873003194888179, + "grad_norm": 0.1298513114452362, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2345 + }, + { + "epoch": 1.8738019169329072, + "grad_norm": 0.18854250013828278, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2346 + }, + { + "epoch": 1.8746006389776357, + "grad_norm": 0.18749283254146576, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2347 + }, + { + "epoch": 1.8753993610223643, + "grad_norm": 0.0791897177696228, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2348 + }, + { + "epoch": 1.8761980830670928, + "grad_norm": 0.061554014682769775, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2349 + }, + { + "epoch": 1.876996805111821, + "grad_norm": 0.07776489108800888, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2350 + }, + { + "epoch": 1.8777955271565494, + "grad_norm": 0.06406589597463608, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2351 + }, + { + "epoch": 1.878594249201278, + "grad_norm": 0.04364178702235222, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2352 + }, + { + "epoch": 1.8793929712460065, + "grad_norm": 0.14296351373195648, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2353 + }, + { + "epoch": 1.880191693290735, + "grad_norm": 0.23554368317127228, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2354 + }, + { + "epoch": 1.8809904153354633, + "grad_norm": 0.17022013664245605, + "learning_rate": 0.0005, + "loss": 1.0315, + "step": 2355 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 0.055340252816677094, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2356 + }, + { + "epoch": 1.8825878594249201, + "grad_norm": 0.10552496463060379, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2357 + }, + { + "epoch": 1.8833865814696487, + "grad_norm": 0.1601826697587967, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2358 + }, + { + "epoch": 1.884185303514377, + "grad_norm": 0.15029270946979523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2359 + }, + { + "epoch": 1.8849840255591053, + "grad_norm": 0.05186127871274948, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2360 + }, + { + "epoch": 1.8857827476038338, + "grad_norm": 0.10678224265575409, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2361 + }, + { + "epoch": 1.8865814696485623, + "grad_norm": 0.1380450427532196, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2362 + }, + { + "epoch": 1.8873801916932909, + "grad_norm": 0.08721969276666641, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2363 + }, + { + "epoch": 1.8881789137380192, + "grad_norm": 0.09425338357686996, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2364 + }, + { + "epoch": 1.8889776357827475, + "grad_norm": 0.16815589368343353, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2365 + }, + { + "epoch": 1.889776357827476, + "grad_norm": 0.16181580722332, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2366 + }, + { + "epoch": 1.8905750798722045, + "grad_norm": 0.054028045386075974, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2367 + }, + { + "epoch": 1.891373801916933, + "grad_norm": 0.07199764251708984, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2368 + }, + { + "epoch": 1.8921725239616614, + "grad_norm": 0.08493109047412872, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2369 + }, + { + "epoch": 1.8929712460063897, + "grad_norm": 0.09665308892726898, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 2370 + }, + { + "epoch": 1.8937699680511182, + "grad_norm": 0.07975895702838898, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2371 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 0.06089888513088226, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2372 + }, + { + "epoch": 1.895367412140575, + "grad_norm": 0.04610683396458626, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2373 + }, + { + "epoch": 1.8961661341853036, + "grad_norm": 0.06083180755376816, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2374 + }, + { + "epoch": 1.8969648562300319, + "grad_norm": 0.07177560776472092, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 2375 + }, + { + "epoch": 1.8977635782747604, + "grad_norm": 0.04214467853307724, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2376 + }, + { + "epoch": 1.898562300319489, + "grad_norm": 0.05166957527399063, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2377 + }, + { + "epoch": 1.8993610223642172, + "grad_norm": 0.040181614458560944, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2378 + }, + { + "epoch": 1.9001597444089455, + "grad_norm": 0.043485358357429504, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2379 + }, + { + "epoch": 1.900958466453674, + "grad_norm": 0.07395761460065842, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2380 + }, + { + "epoch": 1.9017571884984026, + "grad_norm": 0.05133877694606781, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 2381 + }, + { + "epoch": 1.9025559105431311, + "grad_norm": 0.059279292821884155, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2382 + }, + { + "epoch": 1.9033546325878594, + "grad_norm": 0.07573487609624863, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2383 + }, + { + "epoch": 1.9041533546325877, + "grad_norm": 0.07013942301273346, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2384 + }, + { + "epoch": 1.9049520766773163, + "grad_norm": 0.14524684846401215, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 2385 + }, + { + "epoch": 1.9057507987220448, + "grad_norm": 0.17374426126480103, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2386 + }, + { + "epoch": 1.9065495207667733, + "grad_norm": 0.1387263685464859, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2387 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 0.045813702046871185, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2388 + }, + { + "epoch": 1.90814696485623, + "grad_norm": 0.189321830868721, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2389 + }, + { + "epoch": 1.9089456869009584, + "grad_norm": 0.261329710483551, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2390 + }, + { + "epoch": 1.909744408945687, + "grad_norm": 0.1599399596452713, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2391 + }, + { + "epoch": 1.9105431309904153, + "grad_norm": 0.03977127745747566, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2392 + }, + { + "epoch": 1.9113418530351438, + "grad_norm": 0.16269442439079285, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2393 + }, + { + "epoch": 1.9121405750798721, + "grad_norm": 0.22963251173496246, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2394 + }, + { + "epoch": 1.9129392971246006, + "grad_norm": 0.1526031792163849, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2395 + }, + { + "epoch": 1.9137380191693292, + "grad_norm": 0.07236737757921219, + "learning_rate": 0.0005, + "loss": 1.033, + "step": 2396 + }, + { + "epoch": 1.9145367412140575, + "grad_norm": 0.19993482530117035, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2397 + }, + { + "epoch": 1.9153354632587858, + "grad_norm": 0.18950621783733368, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2398 + }, + { + "epoch": 1.9161341853035143, + "grad_norm": 0.10046153515577316, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2399 + }, + { + "epoch": 1.9169329073482428, + "grad_norm": 0.07884453237056732, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2400 + }, + { + "epoch": 1.9177316293929714, + "grad_norm": 0.23947227001190186, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2401 + }, + { + "epoch": 1.9185303514376997, + "grad_norm": 0.2662964165210724, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2402 + }, + { + "epoch": 1.919329073482428, + "grad_norm": 0.1257917582988739, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2403 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 0.09092582017183304, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2404 + }, + { + "epoch": 1.920926517571885, + "grad_norm": 0.19677215814590454, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2405 + }, + { + "epoch": 1.9217252396166136, + "grad_norm": 0.17972320318222046, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2406 + }, + { + "epoch": 1.9225239616613419, + "grad_norm": 0.06155665963888168, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2407 + }, + { + "epoch": 1.9233226837060702, + "grad_norm": 0.14805591106414795, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2408 + }, + { + "epoch": 1.9241214057507987, + "grad_norm": 0.2414662092924118, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2409 + }, + { + "epoch": 1.9249201277955272, + "grad_norm": 0.2084181308746338, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2410 + }, + { + "epoch": 1.9257188498402555, + "grad_norm": 0.05523146688938141, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2411 + }, + { + "epoch": 1.926517571884984, + "grad_norm": 0.13994552195072174, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 2412 + }, + { + "epoch": 1.9273162939297124, + "grad_norm": 0.2648966312408447, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2413 + }, + { + "epoch": 1.928115015974441, + "grad_norm": 0.28959497809410095, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2414 + }, + { + "epoch": 1.9289137380191694, + "grad_norm": 0.11457488685846329, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2415 + }, + { + "epoch": 1.9297124600638977, + "grad_norm": 0.12448041886091232, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2416 + }, + { + "epoch": 1.930511182108626, + "grad_norm": 0.20807982981204987, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2417 + }, + { + "epoch": 1.9313099041533546, + "grad_norm": 0.14537623524665833, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2418 + }, + { + "epoch": 1.932108626198083, + "grad_norm": 0.0428709015250206, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2419 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.07923824340105057, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2420 + }, + { + "epoch": 1.93370607028754, + "grad_norm": 0.06046072393655777, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2421 + }, + { + "epoch": 1.9345047923322682, + "grad_norm": 0.05921380594372749, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2422 + }, + { + "epoch": 1.9353035143769968, + "grad_norm": 0.05324951559305191, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2423 + }, + { + "epoch": 1.9361022364217253, + "grad_norm": 0.060725487768650055, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2424 + }, + { + "epoch": 1.9369009584664538, + "grad_norm": 0.09305386245250702, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2425 + }, + { + "epoch": 1.9376996805111821, + "grad_norm": 0.12314888834953308, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2426 + }, + { + "epoch": 1.9384984025559104, + "grad_norm": 0.08590805530548096, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2427 + }, + { + "epoch": 1.939297124600639, + "grad_norm": 0.07134587317705154, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2428 + }, + { + "epoch": 1.9400958466453675, + "grad_norm": 0.04584966599941254, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2429 + }, + { + "epoch": 1.9408945686900958, + "grad_norm": 0.050389841198921204, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2430 + }, + { + "epoch": 1.9416932907348243, + "grad_norm": 0.055894333869218826, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 2431 + }, + { + "epoch": 1.9424920127795526, + "grad_norm": 0.05231403559446335, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 2432 + }, + { + "epoch": 1.9432907348242812, + "grad_norm": 0.04235154017806053, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2433 + }, + { + "epoch": 1.9440894568690097, + "grad_norm": 0.038994334638118744, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2434 + }, + { + "epoch": 1.944888178913738, + "grad_norm": 0.062291134148836136, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2435 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.10267619043588638, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2436 + }, + { + "epoch": 1.9464856230031948, + "grad_norm": 0.12227646261453629, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2437 + }, + { + "epoch": 1.9472843450479234, + "grad_norm": 0.07677904516458511, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2438 + }, + { + "epoch": 1.9480830670926519, + "grad_norm": 0.043213456869125366, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2439 + }, + { + "epoch": 1.9488817891373802, + "grad_norm": 0.0464320071041584, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2440 + }, + { + "epoch": 1.9496805111821085, + "grad_norm": 0.0488814078271389, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2441 + }, + { + "epoch": 1.950479233226837, + "grad_norm": 0.07102649658918381, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2442 + }, + { + "epoch": 1.9512779552715656, + "grad_norm": 0.056355372071266174, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 2443 + }, + { + "epoch": 1.952076677316294, + "grad_norm": 0.05412770435214043, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2444 + }, + { + "epoch": 1.9528753993610224, + "grad_norm": 0.05533284693956375, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2445 + }, + { + "epoch": 1.9536741214057507, + "grad_norm": 0.07065420597791672, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2446 + }, + { + "epoch": 1.9544728434504792, + "grad_norm": 0.0424923375248909, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2447 + }, + { + "epoch": 1.9552715654952078, + "grad_norm": 0.07682394236326218, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2448 + }, + { + "epoch": 1.956070287539936, + "grad_norm": 0.12305673956871033, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2449 + }, + { + "epoch": 1.9568690095846646, + "grad_norm": 0.12699945271015167, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2450 + }, + { + "epoch": 1.957667731629393, + "grad_norm": 0.09973076730966568, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2451 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 0.04687270149588585, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2452 + }, + { + "epoch": 1.95926517571885, + "grad_norm": 0.16843228042125702, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2453 + }, + { + "epoch": 1.9600638977635783, + "grad_norm": 0.27191975712776184, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2454 + }, + { + "epoch": 1.9608626198083066, + "grad_norm": 0.2563989460468292, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2455 + }, + { + "epoch": 1.961661341853035, + "grad_norm": 0.10264059901237488, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2456 + }, + { + "epoch": 1.9624600638977636, + "grad_norm": 0.12051466107368469, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2457 + }, + { + "epoch": 1.9632587859424921, + "grad_norm": 0.27400559186935425, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2458 + }, + { + "epoch": 1.9640575079872205, + "grad_norm": 0.2756473124027252, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2459 + }, + { + "epoch": 1.9648562300319488, + "grad_norm": 0.09925543516874313, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 2460 + }, + { + "epoch": 1.9656549520766773, + "grad_norm": 0.18176420032978058, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2461 + }, + { + "epoch": 1.9664536741214058, + "grad_norm": 0.353693425655365, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 2462 + }, + { + "epoch": 1.9672523961661343, + "grad_norm": 0.30674099922180176, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2463 + }, + { + "epoch": 1.9680511182108626, + "grad_norm": 0.04689846560359001, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2464 + }, + { + "epoch": 1.968849840255591, + "grad_norm": 0.29758918285369873, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 2465 + }, + { + "epoch": 1.9696485623003195, + "grad_norm": 0.363922655582428, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2466 + }, + { + "epoch": 1.970447284345048, + "grad_norm": 0.19258317351341248, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2467 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 0.10317967087030411, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2468 + }, + { + "epoch": 1.9720447284345048, + "grad_norm": 0.2375856637954712, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 2469 + }, + { + "epoch": 1.9728434504792332, + "grad_norm": 0.13130125403404236, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2470 + }, + { + "epoch": 1.9736421725239617, + "grad_norm": 0.08131767064332962, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2471 + }, + { + "epoch": 1.9744408945686902, + "grad_norm": 0.14860530197620392, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 2472 + }, + { + "epoch": 1.9752396166134185, + "grad_norm": 0.11777997016906738, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2473 + }, + { + "epoch": 1.9760383386581468, + "grad_norm": 0.08397025614976883, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2474 + }, + { + "epoch": 1.9768370607028753, + "grad_norm": 0.08824057132005692, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2475 + }, + { + "epoch": 1.9776357827476039, + "grad_norm": 0.06647378206253052, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2476 + }, + { + "epoch": 1.9784345047923324, + "grad_norm": 0.038043633103370667, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2477 + }, + { + "epoch": 1.9792332268370607, + "grad_norm": 0.08245793730020523, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2478 + }, + { + "epoch": 1.980031948881789, + "grad_norm": 0.1402815282344818, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2479 + }, + { + "epoch": 1.9808306709265175, + "grad_norm": 0.15749140083789825, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2480 + }, + { + "epoch": 1.981629392971246, + "grad_norm": 0.09396994858980179, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2481 + }, + { + "epoch": 1.9824281150159746, + "grad_norm": 0.0725923553109169, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2482 + }, + { + "epoch": 1.983226837060703, + "grad_norm": 0.06790316104888916, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 2483 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 0.04050496965646744, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2484 + }, + { + "epoch": 1.9848242811501597, + "grad_norm": 0.04245828837156296, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2485 + }, + { + "epoch": 1.9856230031948883, + "grad_norm": 0.04818668216466904, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2486 + }, + { + "epoch": 1.9864217252396166, + "grad_norm": 0.07091481238603592, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2487 + }, + { + "epoch": 1.9872204472843449, + "grad_norm": 0.08975768834352493, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2488 + }, + { + "epoch": 1.9880191693290734, + "grad_norm": 0.0920509397983551, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2489 + }, + { + "epoch": 1.988817891373802, + "grad_norm": 0.06188343092799187, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2490 + }, + { + "epoch": 1.9896166134185305, + "grad_norm": 0.03998660668730736, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2491 + }, + { + "epoch": 1.9904153354632588, + "grad_norm": 0.03859339654445648, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2492 + }, + { + "epoch": 1.991214057507987, + "grad_norm": 0.050228461623191833, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2493 + }, + { + "epoch": 1.9920127795527156, + "grad_norm": 0.04037710279226303, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2494 + }, + { + "epoch": 1.9928115015974441, + "grad_norm": 0.04584654048085213, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2495 + }, + { + "epoch": 1.9936102236421727, + "grad_norm": 0.03696245700120926, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2496 + }, + { + "epoch": 1.994408945686901, + "grad_norm": 0.04600491747260094, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2497 + }, + { + "epoch": 1.9952076677316293, + "grad_norm": 0.0943571925163269, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2498 + }, + { + "epoch": 1.9960063897763578, + "grad_norm": 0.11350230127573013, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2499 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.09816325455904007, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2500 + }, + { + "epoch": 1.9976038338658149, + "grad_norm": 0.05887974426150322, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2501 + }, + { + "epoch": 1.9984025559105432, + "grad_norm": 0.039232514798641205, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 2502 + }, + { + "epoch": 1.9992012779552715, + "grad_norm": 0.10776908695697784, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2503 + }, + { + "epoch": 2.0, + "grad_norm": 0.1708499789237976, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2504 + }, + { + "epoch": 2.0007987220447285, + "grad_norm": 0.12712575495243073, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2505 + }, + { + "epoch": 2.001597444089457, + "grad_norm": 0.04130035266280174, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2506 + }, + { + "epoch": 2.002396166134185, + "grad_norm": 0.08062197268009186, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2507 + }, + { + "epoch": 2.0031948881789137, + "grad_norm": 0.11429931968450546, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2508 + }, + { + "epoch": 2.003993610223642, + "grad_norm": 0.06290867924690247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 2509 + }, + { + "epoch": 2.0047923322683707, + "grad_norm": 0.043735455721616745, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2510 + }, + { + "epoch": 2.0055910543130993, + "grad_norm": 0.08331973850727081, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2511 + }, + { + "epoch": 2.0063897763578273, + "grad_norm": 0.07424676418304443, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2512 + }, + { + "epoch": 2.007188498402556, + "grad_norm": 0.0450097881257534, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2513 + }, + { + "epoch": 2.0079872204472844, + "grad_norm": 0.05486248433589935, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2514 + }, + { + "epoch": 2.008785942492013, + "grad_norm": 0.03456762805581093, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2515 + }, + { + "epoch": 2.009584664536741, + "grad_norm": 0.060457173734903336, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2516 + }, + { + "epoch": 2.0103833865814695, + "grad_norm": 0.11361896246671677, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2517 + }, + { + "epoch": 2.011182108626198, + "grad_norm": 0.13272768259048462, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2518 + }, + { + "epoch": 2.0119808306709266, + "grad_norm": 0.06579867750406265, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2519 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.06989869475364685, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2520 + }, + { + "epoch": 2.013578274760383, + "grad_norm": 0.10227718949317932, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2521 + }, + { + "epoch": 2.0143769968051117, + "grad_norm": 0.1155320331454277, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2522 + }, + { + "epoch": 2.0151757188498403, + "grad_norm": 0.08428250998258591, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2523 + }, + { + "epoch": 2.015974440894569, + "grad_norm": 0.07322479784488678, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2524 + }, + { + "epoch": 2.0167731629392973, + "grad_norm": 0.0683116540312767, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2525 + }, + { + "epoch": 2.0175718849840254, + "grad_norm": 0.05594201013445854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2526 + }, + { + "epoch": 2.018370607028754, + "grad_norm": 0.08582351356744766, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2527 + }, + { + "epoch": 2.0191693290734825, + "grad_norm": 0.16223077476024628, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2528 + }, + { + "epoch": 2.019968051118211, + "grad_norm": 0.23563791811466217, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2529 + }, + { + "epoch": 2.0207667731629395, + "grad_norm": 0.2101173847913742, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2530 + }, + { + "epoch": 2.0215654952076676, + "grad_norm": 0.14453741908073425, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2531 + }, + { + "epoch": 2.022364217252396, + "grad_norm": 0.050489380955696106, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2532 + }, + { + "epoch": 2.0231629392971247, + "grad_norm": 0.17723125219345093, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2533 + }, + { + "epoch": 2.023961661341853, + "grad_norm": 0.18600088357925415, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 2534 + }, + { + "epoch": 2.0247603833865813, + "grad_norm": 0.10898424685001373, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2535 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.07256787270307541, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2536 + }, + { + "epoch": 2.0263578274760383, + "grad_norm": 0.1978672444820404, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2537 + }, + { + "epoch": 2.027156549520767, + "grad_norm": 0.20623594522476196, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2538 + }, + { + "epoch": 2.0279552715654954, + "grad_norm": 0.08837094157934189, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2539 + }, + { + "epoch": 2.0287539936102235, + "grad_norm": 0.10977557301521301, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2540 + }, + { + "epoch": 2.029552715654952, + "grad_norm": 0.24850067496299744, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2541 + }, + { + "epoch": 2.0303514376996805, + "grad_norm": 0.29207590222358704, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2542 + }, + { + "epoch": 2.031150159744409, + "grad_norm": 0.1985940933227539, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2543 + }, + { + "epoch": 2.0319488817891376, + "grad_norm": 0.04519326612353325, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2544 + }, + { + "epoch": 2.0327476038338657, + "grad_norm": 0.16939495503902435, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2545 + }, + { + "epoch": 2.033546325878594, + "grad_norm": 0.270275354385376, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2546 + }, + { + "epoch": 2.0343450479233227, + "grad_norm": 0.21180108189582825, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2547 + }, + { + "epoch": 2.0351437699680512, + "grad_norm": 0.0469316728413105, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2548 + }, + { + "epoch": 2.0359424920127798, + "grad_norm": 0.1845361739397049, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2549 + }, + { + "epoch": 2.036741214057508, + "grad_norm": 0.2276308536529541, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2550 + }, + { + "epoch": 2.0375399361022364, + "grad_norm": 0.11676277965307236, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2551 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 0.1021813154220581, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2552 + }, + { + "epoch": 2.0391373801916934, + "grad_norm": 0.28504467010498047, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2553 + }, + { + "epoch": 2.0399361022364215, + "grad_norm": 0.2821798324584961, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2554 + }, + { + "epoch": 2.04073482428115, + "grad_norm": 0.09673242270946503, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2555 + }, + { + "epoch": 2.0415335463258786, + "grad_norm": 0.1784241944551468, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2556 + }, + { + "epoch": 2.042332268370607, + "grad_norm": 0.30749815702438354, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2557 + }, + { + "epoch": 2.0431309904153356, + "grad_norm": 0.2625802457332611, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2558 + }, + { + "epoch": 2.0439297124600637, + "grad_norm": 0.0651462972164154, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 2559 + }, + { + "epoch": 2.0447284345047922, + "grad_norm": 0.2103819102048874, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 2560 + }, + { + "epoch": 2.0455271565495208, + "grad_norm": 0.2854102849960327, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2561 + }, + { + "epoch": 2.0463258785942493, + "grad_norm": 0.14184293150901794, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2562 + }, + { + "epoch": 2.047124600638978, + "grad_norm": 0.06151473522186279, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2563 + }, + { + "epoch": 2.047923322683706, + "grad_norm": 0.1858600378036499, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2564 + }, + { + "epoch": 2.0487220447284344, + "grad_norm": 0.19997341930866241, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2565 + }, + { + "epoch": 2.049520766773163, + "grad_norm": 0.0924893170595169, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2566 + }, + { + "epoch": 2.0503194888178915, + "grad_norm": 0.14571507275104523, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2567 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.2566513121128082, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2568 + }, + { + "epoch": 2.051916932907348, + "grad_norm": 0.24462486803531647, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 2569 + }, + { + "epoch": 2.0527156549520766, + "grad_norm": 0.10544434189796448, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 2570 + }, + { + "epoch": 2.053514376996805, + "grad_norm": 0.08675491809844971, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2571 + }, + { + "epoch": 2.0543130990415337, + "grad_norm": 0.18398417532444, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2572 + }, + { + "epoch": 2.055111821086262, + "grad_norm": 0.15167878568172455, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2573 + }, + { + "epoch": 2.0559105431309903, + "grad_norm": 0.06932301074266434, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2574 + }, + { + "epoch": 2.056709265175719, + "grad_norm": 0.06368319690227509, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2575 + }, + { + "epoch": 2.0575079872204474, + "grad_norm": 0.11785905808210373, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2576 + }, + { + "epoch": 2.058306709265176, + "grad_norm": 0.05494855344295502, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2577 + }, + { + "epoch": 2.059105431309904, + "grad_norm": 0.10618741810321808, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2578 + }, + { + "epoch": 2.0599041533546325, + "grad_norm": 0.14729735255241394, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2579 + }, + { + "epoch": 2.060702875399361, + "grad_norm": 0.08014677464962006, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2580 + }, + { + "epoch": 2.0615015974440896, + "grad_norm": 0.07460471242666245, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2581 + }, + { + "epoch": 2.062300319488818, + "grad_norm": 0.12884479761123657, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2582 + }, + { + "epoch": 2.063099041533546, + "grad_norm": 0.11224616318941116, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2583 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.06026687100529671, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2584 + }, + { + "epoch": 2.0646964856230032, + "grad_norm": 0.06690093874931335, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2585 + }, + { + "epoch": 2.0654952076677318, + "grad_norm": 0.10095079988241196, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2586 + }, + { + "epoch": 2.06629392971246, + "grad_norm": 0.08353506028652191, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2587 + }, + { + "epoch": 2.0670926517571884, + "grad_norm": 0.07060668617486954, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2588 + }, + { + "epoch": 2.067891373801917, + "grad_norm": 0.07298587262630463, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2589 + }, + { + "epoch": 2.0686900958466454, + "grad_norm": 0.04319034889340401, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2590 + }, + { + "epoch": 2.069488817891374, + "grad_norm": 0.04229504242539406, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2591 + }, + { + "epoch": 2.070287539936102, + "grad_norm": 0.05476998910307884, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2592 + }, + { + "epoch": 2.0710862619808306, + "grad_norm": 0.039188139140605927, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2593 + }, + { + "epoch": 2.071884984025559, + "grad_norm": 0.058993417769670486, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2594 + }, + { + "epoch": 2.0726837060702876, + "grad_norm": 0.04871759191155434, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2595 + }, + { + "epoch": 2.073482428115016, + "grad_norm": 0.037119925022125244, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2596 + }, + { + "epoch": 2.0742811501597442, + "grad_norm": 0.06476760655641556, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 2597 + }, + { + "epoch": 2.0750798722044728, + "grad_norm": 0.03558475151658058, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 2598 + }, + { + "epoch": 2.0758785942492013, + "grad_norm": 0.03988872841000557, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2599 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.04446236789226532, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2600 + }, + { + "epoch": 2.0774760383386583, + "grad_norm": 0.058075740933418274, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2601 + }, + { + "epoch": 2.0782747603833864, + "grad_norm": 0.10492820292711258, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2602 + }, + { + "epoch": 2.079073482428115, + "grad_norm": 0.1374005526304245, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2603 + }, + { + "epoch": 2.0798722044728435, + "grad_norm": 0.10932788252830505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 2604 + }, + { + "epoch": 2.080670926517572, + "grad_norm": 0.035826049745082855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2605 + }, + { + "epoch": 2.0814696485623, + "grad_norm": 0.10934802889823914, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2606 + }, + { + "epoch": 2.0822683706070286, + "grad_norm": 0.13302485644817352, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2607 + }, + { + "epoch": 2.083067092651757, + "grad_norm": 0.11253390461206436, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2608 + }, + { + "epoch": 2.0838658146964857, + "grad_norm": 0.04634593054652214, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 2609 + }, + { + "epoch": 2.084664536741214, + "grad_norm": 0.21137909591197968, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2610 + }, + { + "epoch": 2.0854632587859423, + "grad_norm": 0.2771414816379547, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2611 + }, + { + "epoch": 2.086261980830671, + "grad_norm": 0.1959906965494156, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2612 + }, + { + "epoch": 2.0870607028753994, + "grad_norm": 0.042694322764873505, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 2613 + }, + { + "epoch": 2.087859424920128, + "grad_norm": 0.15753871202468872, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2614 + }, + { + "epoch": 2.0886581469648564, + "grad_norm": 0.1917339563369751, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2615 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.05056089907884598, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2616 + }, + { + "epoch": 2.090255591054313, + "grad_norm": 0.16167999804019928, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2617 + }, + { + "epoch": 2.0910543130990416, + "grad_norm": 0.21019205451011658, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2618 + }, + { + "epoch": 2.09185303514377, + "grad_norm": 0.12859253585338593, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2619 + }, + { + "epoch": 2.0926517571884986, + "grad_norm": 0.04561556130647659, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2620 + }, + { + "epoch": 2.0934504792332267, + "grad_norm": 0.19915086030960083, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2621 + }, + { + "epoch": 2.094249201277955, + "grad_norm": 0.2792043685913086, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2622 + }, + { + "epoch": 2.0950479233226837, + "grad_norm": 0.16861289739608765, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2623 + }, + { + "epoch": 2.0958466453674123, + "grad_norm": 0.08431511372327805, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2624 + }, + { + "epoch": 2.0966453674121404, + "grad_norm": 0.26860734820365906, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2625 + }, + { + "epoch": 2.097444089456869, + "grad_norm": 0.2949545979499817, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2626 + }, + { + "epoch": 2.0982428115015974, + "grad_norm": 0.12639857828617096, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2627 + }, + { + "epoch": 2.099041533546326, + "grad_norm": 0.14675533771514893, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2628 + }, + { + "epoch": 2.0998402555910545, + "grad_norm": 0.29298654198646545, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2629 + }, + { + "epoch": 2.1006389776357826, + "grad_norm": 0.20049460232257843, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2630 + }, + { + "epoch": 2.101437699680511, + "grad_norm": 0.05280651897192001, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2631 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.2405036836862564, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2632 + }, + { + "epoch": 2.103035143769968, + "grad_norm": 0.29925718903541565, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2633 + }, + { + "epoch": 2.1038338658146967, + "grad_norm": 0.1330690085887909, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2634 + }, + { + "epoch": 2.1046325878594248, + "grad_norm": 0.11366300284862518, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2635 + }, + { + "epoch": 2.1054313099041533, + "grad_norm": 0.184611514210701, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2636 + }, + { + "epoch": 2.106230031948882, + "grad_norm": 0.0942547619342804, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2637 + }, + { + "epoch": 2.1070287539936103, + "grad_norm": 0.09224486351013184, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2638 + }, + { + "epoch": 2.107827476038339, + "grad_norm": 0.2167433351278305, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2639 + }, + { + "epoch": 2.108626198083067, + "grad_norm": 0.20001453161239624, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2640 + }, + { + "epoch": 2.1094249201277955, + "grad_norm": 0.0551394522190094, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2641 + }, + { + "epoch": 2.110223642172524, + "grad_norm": 0.14991897344589233, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2642 + }, + { + "epoch": 2.1110223642172525, + "grad_norm": 0.21038007736206055, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2643 + }, + { + "epoch": 2.1118210862619806, + "grad_norm": 0.11942024528980255, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2644 + }, + { + "epoch": 2.112619808306709, + "grad_norm": 0.14938029646873474, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 2645 + }, + { + "epoch": 2.1134185303514377, + "grad_norm": 0.3405923843383789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2646 + }, + { + "epoch": 2.114217252396166, + "grad_norm": 0.3363925814628601, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2647 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.12379220873117447, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2648 + }, + { + "epoch": 2.115814696485623, + "grad_norm": 0.1583731323480606, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2649 + }, + { + "epoch": 2.1166134185303513, + "grad_norm": 0.2941076457500458, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2650 + }, + { + "epoch": 2.11741214057508, + "grad_norm": 0.18513287603855133, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2651 + }, + { + "epoch": 2.1182108626198084, + "grad_norm": 0.057797662913799286, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2652 + }, + { + "epoch": 2.119009584664537, + "grad_norm": 0.12461342662572861, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2653 + }, + { + "epoch": 2.119808306709265, + "grad_norm": 0.06276709586381912, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2654 + }, + { + "epoch": 2.1206070287539935, + "grad_norm": 0.06073528528213501, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2655 + }, + { + "epoch": 2.121405750798722, + "grad_norm": 0.07055814564228058, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 2656 + }, + { + "epoch": 2.1222044728434506, + "grad_norm": 0.03508429974317551, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2657 + }, + { + "epoch": 2.123003194888179, + "grad_norm": 0.0474206916987896, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2658 + }, + { + "epoch": 2.123801916932907, + "grad_norm": 0.04067448526620865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2659 + }, + { + "epoch": 2.1246006389776357, + "grad_norm": 0.060025133192539215, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2660 + }, + { + "epoch": 2.1253993610223643, + "grad_norm": 0.061696235090494156, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2661 + }, + { + "epoch": 2.126198083067093, + "grad_norm": 0.060907844454050064, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2662 + }, + { + "epoch": 2.126996805111821, + "grad_norm": 0.06122025474905968, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 2663 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.06885300576686859, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 2664 + }, + { + "epoch": 2.128594249201278, + "grad_norm": 0.047428976744413376, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2665 + }, + { + "epoch": 2.1293929712460065, + "grad_norm": 0.036644674837589264, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2666 + }, + { + "epoch": 2.130191693290735, + "grad_norm": 0.04983266070485115, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2667 + }, + { + "epoch": 2.130990415335463, + "grad_norm": 0.09072417765855789, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2668 + }, + { + "epoch": 2.1317891373801916, + "grad_norm": 0.10644412785768509, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 2669 + }, + { + "epoch": 2.13258785942492, + "grad_norm": 0.07350479066371918, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2670 + }, + { + "epoch": 2.1333865814696487, + "grad_norm": 0.041709840297698975, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 2671 + }, + { + "epoch": 2.134185303514377, + "grad_norm": 0.043592557311058044, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2672 + }, + { + "epoch": 2.1349840255591053, + "grad_norm": 0.04548558592796326, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2673 + }, + { + "epoch": 2.135782747603834, + "grad_norm": 0.03937267139554024, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2674 + }, + { + "epoch": 2.1365814696485623, + "grad_norm": 0.05674131214618683, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2675 + }, + { + "epoch": 2.137380191693291, + "grad_norm": 0.0857989713549614, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2676 + }, + { + "epoch": 2.1381789137380194, + "grad_norm": 0.12659871578216553, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2677 + }, + { + "epoch": 2.1389776357827475, + "grad_norm": 0.10000529885292053, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2678 + }, + { + "epoch": 2.139776357827476, + "grad_norm": 0.060805950313806534, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2679 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.20407895743846893, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2680 + }, + { + "epoch": 2.141373801916933, + "grad_norm": 0.21931609511375427, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2681 + }, + { + "epoch": 2.142172523961661, + "grad_norm": 0.0947318896651268, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2682 + }, + { + "epoch": 2.1429712460063897, + "grad_norm": 0.10082453489303589, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2683 + }, + { + "epoch": 2.143769968051118, + "grad_norm": 0.2510482370853424, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 2684 + }, + { + "epoch": 2.1445686900958467, + "grad_norm": 0.2802210748195648, + "learning_rate": 0.0005, + "loss": 1.0368, + "step": 2685 + }, + { + "epoch": 2.1453674121405752, + "grad_norm": 0.18770602345466614, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2686 + }, + { + "epoch": 2.1461661341853033, + "grad_norm": 0.048588722944259644, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2687 + }, + { + "epoch": 2.146964856230032, + "grad_norm": 0.1443304419517517, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2688 + }, + { + "epoch": 2.1477635782747604, + "grad_norm": 0.22439543902873993, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2689 + }, + { + "epoch": 2.148562300319489, + "grad_norm": 0.16312581300735474, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2690 + }, + { + "epoch": 2.1493610223642174, + "grad_norm": 0.08721408247947693, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2691 + }, + { + "epoch": 2.1501597444089455, + "grad_norm": 0.2756902873516083, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2692 + }, + { + "epoch": 2.150958466453674, + "grad_norm": 0.2834199070930481, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 2693 + }, + { + "epoch": 2.1517571884984026, + "grad_norm": 0.1190086081624031, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 2694 + }, + { + "epoch": 2.152555910543131, + "grad_norm": 0.1246909499168396, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2695 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.2244880348443985, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 2696 + }, + { + "epoch": 2.1541533546325877, + "grad_norm": 0.1424233317375183, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2697 + }, + { + "epoch": 2.1549520766773163, + "grad_norm": 0.10756697505712509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2698 + }, + { + "epoch": 2.155750798722045, + "grad_norm": 0.1688450276851654, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2699 + }, + { + "epoch": 2.1565495207667733, + "grad_norm": 0.12139362096786499, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 2700 + }, + { + "epoch": 2.1573482428115014, + "grad_norm": 0.07833441346883774, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 2701 + }, + { + "epoch": 2.15814696485623, + "grad_norm": 0.22099994122982025, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2702 + }, + { + "epoch": 2.1589456869009584, + "grad_norm": 0.190511554479599, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2703 + }, + { + "epoch": 2.159744408945687, + "grad_norm": 0.07637764513492584, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2704 + }, + { + "epoch": 2.1605431309904155, + "grad_norm": 0.06381702423095703, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2705 + }, + { + "epoch": 2.1613418530351436, + "grad_norm": 0.1343991458415985, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2706 + }, + { + "epoch": 2.162140575079872, + "grad_norm": 0.13090470433235168, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2707 + }, + { + "epoch": 2.1629392971246006, + "grad_norm": 0.04627209156751633, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2708 + }, + { + "epoch": 2.163738019169329, + "grad_norm": 0.060849517583847046, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2709 + }, + { + "epoch": 2.1645367412140577, + "grad_norm": 0.06780707836151123, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2710 + }, + { + "epoch": 2.165335463258786, + "grad_norm": 0.07282490283250809, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 2711 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 0.07168543338775635, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2712 + }, + { + "epoch": 2.166932907348243, + "grad_norm": 0.08716403692960739, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2713 + }, + { + "epoch": 2.1677316293929714, + "grad_norm": 0.09366965293884277, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2714 + }, + { + "epoch": 2.1685303514377, + "grad_norm": 0.09121392667293549, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2715 + }, + { + "epoch": 2.169329073482428, + "grad_norm": 0.06912577152252197, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2716 + }, + { + "epoch": 2.1701277955271565, + "grad_norm": 0.046476542949676514, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 2717 + }, + { + "epoch": 2.170926517571885, + "grad_norm": 0.04065564647316933, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 2718 + }, + { + "epoch": 2.1717252396166136, + "grad_norm": 0.044998086988925934, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2719 + }, + { + "epoch": 2.1725239616613417, + "grad_norm": 0.04588993638753891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2720 + }, + { + "epoch": 2.17332268370607, + "grad_norm": 0.05954091623425484, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 2721 + }, + { + "epoch": 2.1741214057507987, + "grad_norm": 0.07627220451831818, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2722 + }, + { + "epoch": 2.1749201277955272, + "grad_norm": 0.0832771435379982, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2723 + }, + { + "epoch": 2.1757188498402558, + "grad_norm": 0.09901522845029831, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2724 + }, + { + "epoch": 2.176517571884984, + "grad_norm": 0.05773104354739189, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2725 + }, + { + "epoch": 2.1773162939297124, + "grad_norm": 0.0783318281173706, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2726 + }, + { + "epoch": 2.178115015974441, + "grad_norm": 0.12447014451026917, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2727 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.08944697678089142, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2728 + }, + { + "epoch": 2.179712460063898, + "grad_norm": 0.07295451313257217, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2729 + }, + { + "epoch": 2.180511182108626, + "grad_norm": 0.1335693746805191, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2730 + }, + { + "epoch": 2.1813099041533546, + "grad_norm": 0.14618094265460968, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2731 + }, + { + "epoch": 2.182108626198083, + "grad_norm": 0.05047796294093132, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 2732 + }, + { + "epoch": 2.1829073482428116, + "grad_norm": 0.18955212831497192, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2733 + }, + { + "epoch": 2.18370607028754, + "grad_norm": 0.3394540250301361, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2734 + }, + { + "epoch": 2.1845047923322682, + "grad_norm": 0.34607887268066406, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2735 + }, + { + "epoch": 2.1853035143769968, + "grad_norm": 0.19489939510822296, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2736 + }, + { + "epoch": 2.1861022364217253, + "grad_norm": 0.06775379180908203, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 2737 + }, + { + "epoch": 2.186900958466454, + "grad_norm": 0.2376859039068222, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2738 + }, + { + "epoch": 2.187699680511182, + "grad_norm": 0.22686026990413666, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2739 + }, + { + "epoch": 2.1884984025559104, + "grad_norm": 0.059437282383441925, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2740 + }, + { + "epoch": 2.189297124600639, + "grad_norm": 0.184672549366951, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2741 + }, + { + "epoch": 2.1900958466453675, + "grad_norm": 0.21975156664848328, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2742 + }, + { + "epoch": 2.190894568690096, + "grad_norm": 0.08795829117298126, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 2743 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.1045440062880516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2744 + }, + { + "epoch": 2.1924920127795526, + "grad_norm": 0.21037985384464264, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 2745 + }, + { + "epoch": 2.193290734824281, + "grad_norm": 0.17791713774204254, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 2746 + }, + { + "epoch": 2.1940894568690097, + "grad_norm": 0.06028178334236145, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2747 + }, + { + "epoch": 2.194888178913738, + "grad_norm": 0.0801217257976532, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2748 + }, + { + "epoch": 2.1956869009584663, + "grad_norm": 0.11564524471759796, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2749 + }, + { + "epoch": 2.196485623003195, + "grad_norm": 0.0652003139257431, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2750 + }, + { + "epoch": 2.1972843450479234, + "grad_norm": 0.057818979024887085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2751 + }, + { + "epoch": 2.198083067092652, + "grad_norm": 0.10466332733631134, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 2752 + }, + { + "epoch": 2.1988817891373804, + "grad_norm": 0.09350129216909409, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 2753 + }, + { + "epoch": 2.1996805111821085, + "grad_norm": 0.04295926168560982, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2754 + }, + { + "epoch": 2.200479233226837, + "grad_norm": 0.0851534903049469, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2755 + }, + { + "epoch": 2.2012779552715656, + "grad_norm": 0.1857217401266098, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 2756 + }, + { + "epoch": 2.202076677316294, + "grad_norm": 0.18267984688282013, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 2757 + }, + { + "epoch": 2.202875399361022, + "grad_norm": 0.07249841094017029, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2758 + }, + { + "epoch": 2.2036741214057507, + "grad_norm": 0.14335495233535767, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2759 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.24338914453983307, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2760 + }, + { + "epoch": 2.2052715654952078, + "grad_norm": 0.17772778868675232, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 2761 + }, + { + "epoch": 2.2060702875399363, + "grad_norm": 0.04809113219380379, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2762 + }, + { + "epoch": 2.2068690095846644, + "grad_norm": 0.09682228416204453, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2763 + }, + { + "epoch": 2.207667731629393, + "grad_norm": 0.13868102431297302, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2764 + }, + { + "epoch": 2.2084664536741214, + "grad_norm": 0.10956277698278427, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2765 + }, + { + "epoch": 2.20926517571885, + "grad_norm": 0.06163526698946953, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2766 + }, + { + "epoch": 2.2100638977635785, + "grad_norm": 0.14519700407981873, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2767 + }, + { + "epoch": 2.2108626198083066, + "grad_norm": 0.12486071139574051, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2768 + }, + { + "epoch": 2.211661341853035, + "grad_norm": 0.0414549857378006, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2769 + }, + { + "epoch": 2.2124600638977636, + "grad_norm": 0.13828913867473602, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 2770 + }, + { + "epoch": 2.213258785942492, + "grad_norm": 0.18277914822101593, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2771 + }, + { + "epoch": 2.2140575079872207, + "grad_norm": 0.15727964043617249, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 2772 + }, + { + "epoch": 2.2148562300319488, + "grad_norm": 0.07437993586063385, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2773 + }, + { + "epoch": 2.2156549520766773, + "grad_norm": 0.08192550390958786, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2774 + }, + { + "epoch": 2.216453674121406, + "grad_norm": 0.1804617941379547, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2775 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.18431466817855835, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2776 + }, + { + "epoch": 2.2180511182108624, + "grad_norm": 0.11281057447195053, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2777 + }, + { + "epoch": 2.218849840255591, + "grad_norm": 0.0398496650159359, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 2778 + }, + { + "epoch": 2.2196485623003195, + "grad_norm": 0.16930198669433594, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2779 + }, + { + "epoch": 2.220447284345048, + "grad_norm": 0.2384660542011261, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2780 + }, + { + "epoch": 2.2212460063897765, + "grad_norm": 0.18867406249046326, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2781 + }, + { + "epoch": 2.2220447284345046, + "grad_norm": 0.041189488023519516, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 2782 + }, + { + "epoch": 2.222843450479233, + "grad_norm": 0.21946212649345398, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2783 + }, + { + "epoch": 2.2236421725239617, + "grad_norm": 0.3394725024700165, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2784 + }, + { + "epoch": 2.22444089456869, + "grad_norm": 0.09503358602523804, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2785 + }, + { + "epoch": 2.2252396166134187, + "grad_norm": 0.180524080991745, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 2786 + }, + { + "epoch": 2.226038338658147, + "grad_norm": 0.2961865961551666, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 2787 + }, + { + "epoch": 2.2268370607028753, + "grad_norm": 0.25913500785827637, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2788 + }, + { + "epoch": 2.227635782747604, + "grad_norm": 0.08123381435871124, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2789 + }, + { + "epoch": 2.2284345047923324, + "grad_norm": 0.18587692081928253, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2790 + }, + { + "epoch": 2.229233226837061, + "grad_norm": 0.29838815331459045, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 2791 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.2115599811077118, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 2792 + }, + { + "epoch": 2.2308306709265175, + "grad_norm": 0.04708286374807358, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2793 + }, + { + "epoch": 2.231629392971246, + "grad_norm": 0.224795401096344, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 2794 + }, + { + "epoch": 2.2324281150159746, + "grad_norm": 0.2673366665840149, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2795 + }, + { + "epoch": 2.2332268370607027, + "grad_norm": 0.1223720833659172, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2796 + }, + { + "epoch": 2.234025559105431, + "grad_norm": 0.12798862159252167, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2797 + }, + { + "epoch": 2.2348242811501597, + "grad_norm": 0.25721317529678345, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2798 + }, + { + "epoch": 2.2356230031948883, + "grad_norm": 0.16970157623291016, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 2799 + }, + { + "epoch": 2.236421725239617, + "grad_norm": 0.1311950534582138, + "learning_rate": 0.0005, + "loss": 1.0367, + "step": 2800 + }, + { + "epoch": 2.237220447284345, + "grad_norm": 0.32154732942581177, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2801 + }, + { + "epoch": 2.2380191693290734, + "grad_norm": 0.23601645231246948, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2802 + }, + { + "epoch": 2.238817891373802, + "grad_norm": 0.08307314664125443, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2803 + }, + { + "epoch": 2.2396166134185305, + "grad_norm": 0.31183329224586487, + "learning_rate": 0.0005, + "loss": 1.0379, + "step": 2804 + }, + { + "epoch": 2.2404153354632586, + "grad_norm": 0.27391767501831055, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2805 + }, + { + "epoch": 2.241214057507987, + "grad_norm": 0.07247646898031235, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2806 + }, + { + "epoch": 2.2420127795527156, + "grad_norm": 0.1882690042257309, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2807 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.18179158866405487, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2808 + }, + { + "epoch": 2.2436102236421727, + "grad_norm": 0.10761548578739166, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2809 + }, + { + "epoch": 2.244408945686901, + "grad_norm": 0.3067700266838074, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2810 + }, + { + "epoch": 2.2452076677316293, + "grad_norm": 0.17450691759586334, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 2811 + }, + { + "epoch": 2.246006389776358, + "grad_norm": 0.14480780065059662, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 2812 + }, + { + "epoch": 2.2468051118210863, + "grad_norm": 0.3325321078300476, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 2813 + }, + { + "epoch": 2.247603833865815, + "grad_norm": 0.26238250732421875, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2814 + }, + { + "epoch": 2.248402555910543, + "grad_norm": 0.07829522341489792, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 2815 + }, + { + "epoch": 2.2492012779552715, + "grad_norm": 0.269721657037735, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 2816 + }, + { + "epoch": 2.25, + "grad_norm": 0.16362956166267395, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2817 + }, + { + "epoch": 2.2507987220447285, + "grad_norm": 0.08129733055830002, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2818 + }, + { + "epoch": 2.251597444089457, + "grad_norm": 0.18430721759796143, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2819 + }, + { + "epoch": 2.252396166134185, + "grad_norm": 0.09634844213724136, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 2820 + }, + { + "epoch": 2.2531948881789137, + "grad_norm": 0.08204549551010132, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2821 + }, + { + "epoch": 2.253993610223642, + "grad_norm": 0.1140882819890976, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 2822 + }, + { + "epoch": 2.2547923322683707, + "grad_norm": 0.05056345462799072, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2823 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.06505320966243744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2824 + }, + { + "epoch": 2.2563897763578273, + "grad_norm": 0.11316727101802826, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2825 + }, + { + "epoch": 2.257188498402556, + "grad_norm": 0.1036633774638176, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2826 + }, + { + "epoch": 2.2579872204472844, + "grad_norm": 0.0470670685172081, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 2827 + }, + { + "epoch": 2.258785942492013, + "grad_norm": 0.0880327895283699, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2828 + }, + { + "epoch": 2.2595846645367414, + "grad_norm": 0.07664912939071655, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2829 + }, + { + "epoch": 2.2603833865814695, + "grad_norm": 0.049471575766801834, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2830 + }, + { + "epoch": 2.261182108626198, + "grad_norm": 0.04288775101304054, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2831 + }, + { + "epoch": 2.2619808306709266, + "grad_norm": 0.10124537348747253, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2832 + }, + { + "epoch": 2.262779552715655, + "grad_norm": 0.13865061104297638, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2833 + }, + { + "epoch": 2.263578274760383, + "grad_norm": 0.10227467864751816, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2834 + }, + { + "epoch": 2.2643769968051117, + "grad_norm": 0.050575822591781616, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 2835 + }, + { + "epoch": 2.2651757188498403, + "grad_norm": 0.044946715235710144, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2836 + }, + { + "epoch": 2.265974440894569, + "grad_norm": 0.0712895616889, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 2837 + }, + { + "epoch": 2.2667731629392973, + "grad_norm": 0.07044374942779541, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2838 + }, + { + "epoch": 2.2675718849840254, + "grad_norm": 0.04518461972475052, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2839 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 0.05259617418050766, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2840 + }, + { + "epoch": 2.2691693290734825, + "grad_norm": 0.0654863640666008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 2841 + }, + { + "epoch": 2.269968051118211, + "grad_norm": 0.04345248267054558, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 2842 + }, + { + "epoch": 2.270766773162939, + "grad_norm": 0.057224296033382416, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2843 + }, + { + "epoch": 2.2715654952076676, + "grad_norm": 0.11091717332601547, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 2844 + }, + { + "epoch": 2.272364217252396, + "grad_norm": 0.11426062136888504, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 2845 + }, + { + "epoch": 2.2731629392971247, + "grad_norm": 0.10064966231584549, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2846 + }, + { + "epoch": 2.273961661341853, + "grad_norm": 0.13716623187065125, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 2847 + }, + { + "epoch": 2.2747603833865817, + "grad_norm": 0.09014318138360977, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 2848 + }, + { + "epoch": 2.27555910543131, + "grad_norm": 0.16652478277683258, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2849 + }, + { + "epoch": 2.2763578274760383, + "grad_norm": 0.14217601716518402, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 2850 + }, + { + "epoch": 2.277156549520767, + "grad_norm": 0.03895508497953415, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 2851 + }, + { + "epoch": 2.2779552715654954, + "grad_norm": 0.17713558673858643, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2852 + }, + { + "epoch": 2.2787539936102235, + "grad_norm": 0.32960572838783264, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 2853 + }, + { + "epoch": 2.279552715654952, + "grad_norm": 0.2481910139322281, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 2854 + }, + { + "epoch": 2.2803514376996805, + "grad_norm": 0.06643390655517578, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2855 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.17466357350349426, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 2856 + }, + { + "epoch": 2.2819488817891376, + "grad_norm": 0.27781131863594055, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 2857 + }, + { + "epoch": 2.2827476038338657, + "grad_norm": 0.19475431740283966, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 2858 + }, + { + "epoch": 2.283546325878594, + "grad_norm": 0.07700221985578537, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 2859 + }, + { + "epoch": 2.2843450479233227, + "grad_norm": 0.22520926594734192, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 2860 + }, + { + "epoch": 2.2851437699680512, + "grad_norm": 0.18735183775424957, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2861 + }, + { + "epoch": 2.2859424920127793, + "grad_norm": 0.04133198782801628, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2862 + }, + { + "epoch": 2.286741214057508, + "grad_norm": 0.2526150941848755, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2863 + }, + { + "epoch": 2.2875399361022364, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2864 + }, + { + "epoch": 2.288338658146965, + "grad_norm": 0.12839898467063904, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2865 + }, + { + "epoch": 2.2891373801916934, + "grad_norm": 0.1259411871433258, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 2866 + }, + { + "epoch": 2.289936102236422, + "grad_norm": 0.25480905175209045, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 2867 + }, + { + "epoch": 2.29073482428115, + "grad_norm": 0.15650653839111328, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2868 + }, + { + "epoch": 2.2915335463258786, + "grad_norm": 0.07474946230649948, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 2869 + }, + { + "epoch": 2.292332268370607, + "grad_norm": 0.170192688703537, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2870 + }, + { + "epoch": 2.2931309904153356, + "grad_norm": 0.13292376697063446, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2871 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 0.045553866773843765, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 2872 + }, + { + "epoch": 2.2947284345047922, + "grad_norm": 0.10853269696235657, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2873 + }, + { + "epoch": 2.2955271565495208, + "grad_norm": 0.09945288300514221, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2874 + }, + { + "epoch": 2.2963258785942493, + "grad_norm": 0.039073117077350616, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2875 + }, + { + "epoch": 2.297124600638978, + "grad_norm": 0.05867530405521393, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2876 + }, + { + "epoch": 2.297923322683706, + "grad_norm": 0.07227179408073425, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2877 + }, + { + "epoch": 2.2987220447284344, + "grad_norm": 0.04456201195716858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 2878 + }, + { + "epoch": 2.299520766773163, + "grad_norm": 0.11672481894493103, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 2879 + }, + { + "epoch": 2.3003194888178915, + "grad_norm": 0.12335679680109024, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 2880 + }, + { + "epoch": 2.3011182108626196, + "grad_norm": 0.043409012258052826, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 2881 + }, + { + "epoch": 2.301916932907348, + "grad_norm": 0.09896806627511978, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 2882 + }, + { + "epoch": 2.3027156549520766, + "grad_norm": 0.2037963569164276, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2883 + }, + { + "epoch": 2.303514376996805, + "grad_norm": 0.21378903090953827, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2884 + }, + { + "epoch": 2.3043130990415337, + "grad_norm": 0.062362927943468094, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2885 + }, + { + "epoch": 2.3051118210862622, + "grad_norm": 0.17370136082172394, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 2886 + }, + { + "epoch": 2.3059105431309903, + "grad_norm": 0.23190435767173767, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 2887 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.08148342370986938, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 2888 + }, + { + "epoch": 2.3075079872204474, + "grad_norm": 0.1596807837486267, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 2889 + }, + { + "epoch": 2.308306709265176, + "grad_norm": 0.26396819949150085, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2890 + }, + { + "epoch": 2.309105431309904, + "grad_norm": 0.1509561687707901, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 2891 + }, + { + "epoch": 2.3099041533546325, + "grad_norm": 0.09147104620933533, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 2892 + }, + { + "epoch": 2.310702875399361, + "grad_norm": 0.23575374484062195, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2893 + }, + { + "epoch": 2.3115015974440896, + "grad_norm": 0.18403767049312592, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 2894 + }, + { + "epoch": 2.312300319488818, + "grad_norm": 0.052600763738155365, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 2895 + }, + { + "epoch": 2.313099041533546, + "grad_norm": 0.18707415461540222, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2896 + }, + { + "epoch": 2.3138977635782747, + "grad_norm": 0.20824143290519714, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 2897 + }, + { + "epoch": 2.3146964856230032, + "grad_norm": 0.0775759220123291, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2898 + }, + { + "epoch": 2.3154952076677318, + "grad_norm": 0.10904766619205475, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 2899 + }, + { + "epoch": 2.31629392971246, + "grad_norm": 0.1562514752149582, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 2900 + }, + { + "epoch": 2.3170926517571884, + "grad_norm": 0.06689859926700592, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2901 + }, + { + "epoch": 2.317891373801917, + "grad_norm": 0.0887206643819809, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 2902 + }, + { + "epoch": 2.3186900958466454, + "grad_norm": 0.13615944981575012, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2903 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.08094146102666855, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 2904 + }, + { + "epoch": 2.3202875399361025, + "grad_norm": 0.06734368950128555, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 2905 + }, + { + "epoch": 2.3210862619808306, + "grad_norm": 0.17405667901039124, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 2906 + }, + { + "epoch": 2.321884984025559, + "grad_norm": 0.23022079467773438, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 2907 + }, + { + "epoch": 2.3226837060702876, + "grad_norm": 0.17341896891593933, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 2908 + }, + { + "epoch": 2.323482428115016, + "grad_norm": 0.037751875817775726, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 2909 + }, + { + "epoch": 2.3242811501597442, + "grad_norm": 0.12434598803520203, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2910 + }, + { + "epoch": 2.3250798722044728, + "grad_norm": 0.11344511806964874, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 2911 + }, + { + "epoch": 2.3258785942492013, + "grad_norm": 0.05426390469074249, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 2912 + }, + { + "epoch": 2.32667731629393, + "grad_norm": 0.11261611431837082, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 2913 + }, + { + "epoch": 2.3274760383386583, + "grad_norm": 0.22023531794548035, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2914 + }, + { + "epoch": 2.3282747603833864, + "grad_norm": 0.2050291895866394, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 2915 + }, + { + "epoch": 2.329073482428115, + "grad_norm": 0.05478905141353607, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2916 + }, + { + "epoch": 2.3298722044728435, + "grad_norm": 0.15363283455371857, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 2917 + }, + { + "epoch": 2.330670926517572, + "grad_norm": 0.17348943650722504, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2918 + }, + { + "epoch": 2.3314696485623, + "grad_norm": 0.05366649851202965, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2919 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 0.16219462454319, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2920 + }, + { + "epoch": 2.333067092651757, + "grad_norm": 0.23911446332931519, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2921 + }, + { + "epoch": 2.3338658146964857, + "grad_norm": 0.12384039163589478, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 2922 + }, + { + "epoch": 2.334664536741214, + "grad_norm": 0.08747945725917816, + "learning_rate": 0.0005, + "loss": 1.0351, + "step": 2923 + }, + { + "epoch": 2.3354632587859427, + "grad_norm": 0.19737359881401062, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2924 + }, + { + "epoch": 2.336261980830671, + "grad_norm": 0.11312227696180344, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 2925 + }, + { + "epoch": 2.3370607028753994, + "grad_norm": 0.09944877028465271, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 2926 + }, + { + "epoch": 2.337859424920128, + "grad_norm": 0.23282872140407562, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 2927 + }, + { + "epoch": 2.3386581469648564, + "grad_norm": 0.14369411766529083, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2928 + }, + { + "epoch": 2.3394568690095845, + "grad_norm": 0.07267388701438904, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2929 + }, + { + "epoch": 2.340255591054313, + "grad_norm": 0.18751965463161469, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2930 + }, + { + "epoch": 2.3410543130990416, + "grad_norm": 0.20886634290218353, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2931 + }, + { + "epoch": 2.34185303514377, + "grad_norm": 0.11675436794757843, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2932 + }, + { + "epoch": 2.3426517571884986, + "grad_norm": 0.08915580064058304, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 2933 + }, + { + "epoch": 2.3434504792332267, + "grad_norm": 0.1534406840801239, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 2934 + }, + { + "epoch": 2.344249201277955, + "grad_norm": 0.08791724592447281, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 2935 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 0.04647858813405037, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 2936 + }, + { + "epoch": 2.3458466453674123, + "grad_norm": 0.09236840158700943, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 2937 + }, + { + "epoch": 2.3466453674121404, + "grad_norm": 0.09079006314277649, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 2938 + }, + { + "epoch": 2.347444089456869, + "grad_norm": 0.03492455556988716, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 2939 + }, + { + "epoch": 2.3482428115015974, + "grad_norm": 0.11871617287397385, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2940 + }, + { + "epoch": 2.349041533546326, + "grad_norm": 0.10904752463102341, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2941 + }, + { + "epoch": 2.3498402555910545, + "grad_norm": 0.05331781879067421, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 2942 + }, + { + "epoch": 2.3506389776357826, + "grad_norm": 0.1213313564658165, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 2943 + }, + { + "epoch": 2.351437699680511, + "grad_norm": 0.12995922565460205, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 2944 + }, + { + "epoch": 2.3522364217252396, + "grad_norm": 0.05770767107605934, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 2945 + }, + { + "epoch": 2.353035143769968, + "grad_norm": 0.09310754388570786, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 2946 + }, + { + "epoch": 2.3538338658146967, + "grad_norm": 0.17539645731449127, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 2947 + }, + { + "epoch": 2.3546325878594248, + "grad_norm": 0.14126333594322205, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2948 + }, + { + "epoch": 2.3554313099041533, + "grad_norm": 0.04220091179013252, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2949 + }, + { + "epoch": 2.356230031948882, + "grad_norm": 0.14341594278812408, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 2950 + }, + { + "epoch": 2.3570287539936103, + "grad_norm": 0.13884525001049042, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 2951 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 0.040859755128622055, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 2952 + }, + { + "epoch": 2.358626198083067, + "grad_norm": 0.14475658535957336, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 2953 + }, + { + "epoch": 2.3594249201277955, + "grad_norm": 0.18962377309799194, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 2954 + }, + { + "epoch": 2.360223642172524, + "grad_norm": 0.0909075066447258, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2955 + }, + { + "epoch": 2.3610223642172525, + "grad_norm": 0.08225106447935104, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2956 + }, + { + "epoch": 2.3618210862619806, + "grad_norm": 0.1564486026763916, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 2957 + }, + { + "epoch": 2.362619808306709, + "grad_norm": 0.08859751373529434, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 2958 + }, + { + "epoch": 2.3634185303514377, + "grad_norm": 0.10907880961894989, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 2959 + }, + { + "epoch": 2.364217252396166, + "grad_norm": 0.2368745654821396, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2960 + }, + { + "epoch": 2.3650159744408947, + "grad_norm": 0.15427371859550476, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 2961 + }, + { + "epoch": 2.365814696485623, + "grad_norm": 0.07661470025777817, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 2962 + }, + { + "epoch": 2.3666134185303513, + "grad_norm": 0.2368732988834381, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 2963 + }, + { + "epoch": 2.36741214057508, + "grad_norm": 0.24830125272274017, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2964 + }, + { + "epoch": 2.3682108626198084, + "grad_norm": 0.06940490007400513, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 2965 + }, + { + "epoch": 2.369009584664537, + "grad_norm": 0.18672171235084534, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 2966 + }, + { + "epoch": 2.369808306709265, + "grad_norm": 0.22521120309829712, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 2967 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 0.0496690534055233, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 2968 + }, + { + "epoch": 2.371405750798722, + "grad_norm": 0.16735650599002838, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 2969 + }, + { + "epoch": 2.3722044728434506, + "grad_norm": 0.18583746254444122, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2970 + }, + { + "epoch": 2.373003194888179, + "grad_norm": 0.03828646242618561, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 2971 + }, + { + "epoch": 2.373801916932907, + "grad_norm": 0.14302043616771698, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 2972 + }, + { + "epoch": 2.3746006389776357, + "grad_norm": 0.14217248558998108, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2973 + }, + { + "epoch": 2.3753993610223643, + "grad_norm": 0.08656741678714752, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 2974 + }, + { + "epoch": 2.376198083067093, + "grad_norm": 0.18724001944065094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 2975 + }, + { + "epoch": 2.376996805111821, + "grad_norm": 0.21609556674957275, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 2976 + }, + { + "epoch": 2.3777955271565494, + "grad_norm": 0.08098721504211426, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 2977 + }, + { + "epoch": 2.378594249201278, + "grad_norm": 0.09842941910028458, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 2978 + }, + { + "epoch": 2.3793929712460065, + "grad_norm": 0.14060764014720917, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 2979 + }, + { + "epoch": 2.380191693290735, + "grad_norm": 0.063141830265522, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 2980 + }, + { + "epoch": 2.380990415335463, + "grad_norm": 0.10411619395017624, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 2981 + }, + { + "epoch": 2.3817891373801916, + "grad_norm": 0.15445855259895325, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 2982 + }, + { + "epoch": 2.38258785942492, + "grad_norm": 0.07754000276327133, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 2983 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 0.05312122777104378, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 2984 + }, + { + "epoch": 2.384185303514377, + "grad_norm": 0.09916596859693527, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 2985 + }, + { + "epoch": 2.3849840255591053, + "grad_norm": 0.12749150395393372, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 2986 + }, + { + "epoch": 2.385782747603834, + "grad_norm": 0.054589178413152695, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 2987 + }, + { + "epoch": 2.3865814696485623, + "grad_norm": 0.08480732887983322, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2988 + }, + { + "epoch": 2.387380191693291, + "grad_norm": 0.13158805668354034, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 2989 + }, + { + "epoch": 2.3881789137380194, + "grad_norm": 0.11916540563106537, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 2990 + }, + { + "epoch": 2.3889776357827475, + "grad_norm": 0.05829031020402908, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 2991 + }, + { + "epoch": 2.389776357827476, + "grad_norm": 0.18292354047298431, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 2992 + }, + { + "epoch": 2.3905750798722045, + "grad_norm": 0.18494512140750885, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 2993 + }, + { + "epoch": 2.391373801916933, + "grad_norm": 0.06371760368347168, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 2994 + }, + { + "epoch": 2.392172523961661, + "grad_norm": 0.10157672315835953, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 2995 + }, + { + "epoch": 2.3929712460063897, + "grad_norm": 0.13981172442436218, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 2996 + }, + { + "epoch": 2.393769968051118, + "grad_norm": 0.07794835418462753, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 2997 + }, + { + "epoch": 2.3945686900958467, + "grad_norm": 0.038293492048978806, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 2998 + }, + { + "epoch": 2.3953674121405752, + "grad_norm": 0.06315408647060394, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 2999 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 0.045907966792583466, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3000 + }, + { + "epoch": 2.396964856230032, + "grad_norm": 0.038717497140169144, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3001 + }, + { + "epoch": 2.3977635782747604, + "grad_norm": 0.0376095287501812, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3002 + }, + { + "epoch": 2.398562300319489, + "grad_norm": 0.05739009007811546, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3003 + }, + { + "epoch": 2.3993610223642174, + "grad_norm": 0.034832656383514404, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3004 + }, + { + "epoch": 2.4001597444089455, + "grad_norm": 0.06432276219129562, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3005 + }, + { + "epoch": 2.400958466453674, + "grad_norm": 0.05443817004561424, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3006 + }, + { + "epoch": 2.4017571884984026, + "grad_norm": 0.04691087454557419, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3007 + }, + { + "epoch": 2.402555910543131, + "grad_norm": 0.04394471272826195, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3008 + }, + { + "epoch": 2.4033546325878596, + "grad_norm": 0.03642019256949425, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3009 + }, + { + "epoch": 2.4041533546325877, + "grad_norm": 0.05891808122396469, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3010 + }, + { + "epoch": 2.4049520766773163, + "grad_norm": 0.04530616104602814, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3011 + }, + { + "epoch": 2.405750798722045, + "grad_norm": 0.0518258772790432, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3012 + }, + { + "epoch": 2.4065495207667733, + "grad_norm": 0.11279664188623428, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3013 + }, + { + "epoch": 2.4073482428115014, + "grad_norm": 0.10047753900289536, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3014 + }, + { + "epoch": 2.40814696485623, + "grad_norm": 0.06645897775888443, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3015 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 0.03372915834188461, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3016 + }, + { + "epoch": 2.409744408945687, + "grad_norm": 0.05353475734591484, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3017 + }, + { + "epoch": 2.4105431309904155, + "grad_norm": 0.038493942469358444, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3018 + }, + { + "epoch": 2.4113418530351436, + "grad_norm": 0.07303082197904587, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3019 + }, + { + "epoch": 2.412140575079872, + "grad_norm": 0.043219298124313354, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3020 + }, + { + "epoch": 2.4129392971246006, + "grad_norm": 0.05016458407044411, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3021 + }, + { + "epoch": 2.413738019169329, + "grad_norm": 0.08490880578756332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3022 + }, + { + "epoch": 2.4145367412140573, + "grad_norm": 0.07245411723852158, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3023 + }, + { + "epoch": 2.415335463258786, + "grad_norm": 0.052343063056468964, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3024 + }, + { + "epoch": 2.4161341853035143, + "grad_norm": 0.13449524343013763, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3025 + }, + { + "epoch": 2.416932907348243, + "grad_norm": 0.13177144527435303, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3026 + }, + { + "epoch": 2.4177316293929714, + "grad_norm": 0.06579594314098358, + "learning_rate": 0.0005, + "loss": 1.0372, + "step": 3027 + }, + { + "epoch": 2.4185303514377, + "grad_norm": 0.12716646492481232, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3028 + }, + { + "epoch": 2.419329073482428, + "grad_norm": 0.20006005465984344, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3029 + }, + { + "epoch": 2.4201277955271565, + "grad_norm": 0.16598355770111084, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3030 + }, + { + "epoch": 2.420926517571885, + "grad_norm": 0.06625109165906906, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3031 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.10521841049194336, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3032 + }, + { + "epoch": 2.4225239616613417, + "grad_norm": 0.14134426414966583, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3033 + }, + { + "epoch": 2.42332268370607, + "grad_norm": 0.056669678539037704, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3034 + }, + { + "epoch": 2.4241214057507987, + "grad_norm": 0.052738044410943985, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3035 + }, + { + "epoch": 2.4249201277955272, + "grad_norm": 0.06623729318380356, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3036 + }, + { + "epoch": 2.4257188498402558, + "grad_norm": 0.04038512706756592, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3037 + }, + { + "epoch": 2.426517571884984, + "grad_norm": 0.057600609958171844, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3038 + }, + { + "epoch": 2.4273162939297124, + "grad_norm": 0.08174199610948563, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3039 + }, + { + "epoch": 2.428115015974441, + "grad_norm": 0.07850457727909088, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3040 + }, + { + "epoch": 2.4289137380191694, + "grad_norm": 0.04368523135781288, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3041 + }, + { + "epoch": 2.4297124600638975, + "grad_norm": 0.11637478321790695, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3042 + }, + { + "epoch": 2.430511182108626, + "grad_norm": 0.09765078872442245, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3043 + }, + { + "epoch": 2.4313099041533546, + "grad_norm": 0.04842933267354965, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3044 + }, + { + "epoch": 2.432108626198083, + "grad_norm": 0.08858928829431534, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3045 + }, + { + "epoch": 2.4329073482428116, + "grad_norm": 0.12645326554775238, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3046 + }, + { + "epoch": 2.43370607028754, + "grad_norm": 0.09839878976345062, + "learning_rate": 0.0005, + "loss": 1.0337, + "step": 3047 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 0.04484904557466507, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3048 + }, + { + "epoch": 2.4353035143769968, + "grad_norm": 0.13912586867809296, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 3049 + }, + { + "epoch": 2.4361022364217253, + "grad_norm": 0.18569444119930267, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3050 + }, + { + "epoch": 2.436900958466454, + "grad_norm": 0.13544169068336487, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3051 + }, + { + "epoch": 2.437699680511182, + "grad_norm": 0.04663483425974846, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3052 + }, + { + "epoch": 2.4384984025559104, + "grad_norm": 0.11609578132629395, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3053 + }, + { + "epoch": 2.439297124600639, + "grad_norm": 0.17497499287128448, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3054 + }, + { + "epoch": 2.4400958466453675, + "grad_norm": 0.19216352701187134, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3055 + }, + { + "epoch": 2.440894568690096, + "grad_norm": 0.11638841032981873, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3056 + }, + { + "epoch": 2.441693290734824, + "grad_norm": 0.05816149711608887, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3057 + }, + { + "epoch": 2.4424920127795526, + "grad_norm": 0.1650087982416153, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3058 + }, + { + "epoch": 2.443290734824281, + "grad_norm": 0.2105383425951004, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3059 + }, + { + "epoch": 2.4440894568690097, + "grad_norm": 0.133597731590271, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3060 + }, + { + "epoch": 2.4448881789137378, + "grad_norm": 0.03882076218724251, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3061 + }, + { + "epoch": 2.4456869009584663, + "grad_norm": 0.08914566785097122, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3062 + }, + { + "epoch": 2.446485623003195, + "grad_norm": 0.08115291595458984, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3063 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 0.0402134470641613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3064 + }, + { + "epoch": 2.448083067092652, + "grad_norm": 0.12838906049728394, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3065 + }, + { + "epoch": 2.4488817891373804, + "grad_norm": 0.1865018606185913, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3066 + }, + { + "epoch": 2.4496805111821085, + "grad_norm": 0.13134929537773132, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3067 + }, + { + "epoch": 2.450479233226837, + "grad_norm": 0.05415928363800049, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3068 + }, + { + "epoch": 2.4512779552715656, + "grad_norm": 0.0739838033914566, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3069 + }, + { + "epoch": 2.452076677316294, + "grad_norm": 0.07965957373380661, + "learning_rate": 0.0005, + "loss": 1.0354, + "step": 3070 + }, + { + "epoch": 2.452875399361022, + "grad_norm": 0.0416380800306797, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3071 + }, + { + "epoch": 2.4536741214057507, + "grad_norm": 0.03494519367814064, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3072 + }, + { + "epoch": 2.4544728434504792, + "grad_norm": 0.050772733986377716, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3073 + }, + { + "epoch": 2.4552715654952078, + "grad_norm": 0.03939373791217804, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3074 + }, + { + "epoch": 2.4560702875399363, + "grad_norm": 0.11769624799489975, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3075 + }, + { + "epoch": 2.4568690095846644, + "grad_norm": 0.33884114027023315, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3076 + }, + { + "epoch": 2.457667731629393, + "grad_norm": 0.07171089947223663, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3077 + }, + { + "epoch": 2.4584664536741214, + "grad_norm": 0.0707232877612114, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3078 + }, + { + "epoch": 2.45926517571885, + "grad_norm": 0.14245279133319855, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3079 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.12356095761060715, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3080 + }, + { + "epoch": 2.4608626198083066, + "grad_norm": 0.0694037601351738, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3081 + }, + { + "epoch": 2.461661341853035, + "grad_norm": 0.0511220321059227, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3082 + }, + { + "epoch": 2.4624600638977636, + "grad_norm": 0.10915348678827286, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3083 + }, + { + "epoch": 2.463258785942492, + "grad_norm": 0.10797106474637985, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3084 + }, + { + "epoch": 2.4640575079872207, + "grad_norm": 0.05721200630068779, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3085 + }, + { + "epoch": 2.4648562300319488, + "grad_norm": 0.04477681592106819, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3086 + }, + { + "epoch": 2.4656549520766773, + "grad_norm": 0.08826448023319244, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3087 + }, + { + "epoch": 2.466453674121406, + "grad_norm": 0.1024692952632904, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3088 + }, + { + "epoch": 2.4672523961661343, + "grad_norm": 0.06543146073818207, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3089 + }, + { + "epoch": 2.4680511182108624, + "grad_norm": 0.06146182119846344, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3090 + }, + { + "epoch": 2.468849840255591, + "grad_norm": 0.12857408821582794, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3091 + }, + { + "epoch": 2.4696485623003195, + "grad_norm": 0.12273124605417252, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3092 + }, + { + "epoch": 2.470447284345048, + "grad_norm": 0.06467662751674652, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3093 + }, + { + "epoch": 2.4712460063897765, + "grad_norm": 0.07181179523468018, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3094 + }, + { + "epoch": 2.4720447284345046, + "grad_norm": 0.20223456621170044, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3095 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 0.25061357021331787, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3096 + }, + { + "epoch": 2.4736421725239617, + "grad_norm": 0.16317492723464966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3097 + }, + { + "epoch": 2.47444089456869, + "grad_norm": 0.04005994647741318, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3098 + }, + { + "epoch": 2.4752396166134183, + "grad_norm": 0.15954583883285522, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3099 + }, + { + "epoch": 2.476038338658147, + "grad_norm": 0.2088920623064041, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3100 + }, + { + "epoch": 2.4768370607028753, + "grad_norm": 0.11643055826425552, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3101 + }, + { + "epoch": 2.477635782747604, + "grad_norm": 0.11083687841892242, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3102 + }, + { + "epoch": 2.4784345047923324, + "grad_norm": 0.24777425825595856, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3103 + }, + { + "epoch": 2.479233226837061, + "grad_norm": 0.19513146579265594, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3104 + }, + { + "epoch": 2.480031948881789, + "grad_norm": 0.05009200796484947, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3105 + }, + { + "epoch": 2.4808306709265175, + "grad_norm": 0.2673046588897705, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3106 + }, + { + "epoch": 2.481629392971246, + "grad_norm": 0.3035629093647003, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3107 + }, + { + "epoch": 2.4824281150159746, + "grad_norm": 0.13213352859020233, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3108 + }, + { + "epoch": 2.4832268370607027, + "grad_norm": 0.13605083525180817, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3109 + }, + { + "epoch": 2.484025559105431, + "grad_norm": 0.2958623170852661, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3110 + }, + { + "epoch": 2.4848242811501597, + "grad_norm": 0.23080390691757202, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3111 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 0.046950701624155045, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3112 + }, + { + "epoch": 2.486421725239617, + "grad_norm": 0.24903765320777893, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3113 + }, + { + "epoch": 2.487220447284345, + "grad_norm": 0.233968585729599, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3114 + }, + { + "epoch": 2.4880191693290734, + "grad_norm": 0.04709520563483238, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3115 + }, + { + "epoch": 2.488817891373802, + "grad_norm": 0.16599629819393158, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3116 + }, + { + "epoch": 2.4896166134185305, + "grad_norm": 0.19273866713047028, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3117 + }, + { + "epoch": 2.4904153354632586, + "grad_norm": 0.11514598876237869, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3118 + }, + { + "epoch": 2.491214057507987, + "grad_norm": 0.08656881004571915, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 3119 + }, + { + "epoch": 2.4920127795527156, + "grad_norm": 0.18213899433612823, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3120 + }, + { + "epoch": 2.492811501597444, + "grad_norm": 0.11029175668954849, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3121 + }, + { + "epoch": 2.4936102236421727, + "grad_norm": 0.04480903223156929, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3122 + }, + { + "epoch": 2.494408945686901, + "grad_norm": 0.04919225722551346, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3123 + }, + { + "epoch": 2.4952076677316293, + "grad_norm": 0.06349056959152222, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3124 + }, + { + "epoch": 2.496006389776358, + "grad_norm": 0.04066464304924011, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3125 + }, + { + "epoch": 2.4968051118210863, + "grad_norm": 0.03992457687854767, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3126 + }, + { + "epoch": 2.497603833865815, + "grad_norm": 0.04580394923686981, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3127 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.13679265975952148, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3128 + }, + { + "epoch": 2.4992012779552715, + "grad_norm": 0.20708884298801422, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3129 + }, + { + "epoch": 2.5, + "grad_norm": 0.22991639375686646, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3130 + }, + { + "epoch": 2.5007987220447285, + "grad_norm": 0.15380895137786865, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3131 + }, + { + "epoch": 2.501597444089457, + "grad_norm": 0.05112789571285248, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3132 + }, + { + "epoch": 2.502396166134185, + "grad_norm": 0.19797906279563904, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3133 + }, + { + "epoch": 2.5031948881789137, + "grad_norm": 0.18190141022205353, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3134 + }, + { + "epoch": 2.503993610223642, + "grad_norm": 0.04291468858718872, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3135 + }, + { + "epoch": 2.5047923322683707, + "grad_norm": 0.14576731622219086, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3136 + }, + { + "epoch": 2.505591054313099, + "grad_norm": 0.25093281269073486, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3137 + }, + { + "epoch": 2.5063897763578273, + "grad_norm": 0.22738556563854218, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3138 + }, + { + "epoch": 2.507188498402556, + "grad_norm": 0.08985915035009384, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3139 + }, + { + "epoch": 2.5079872204472844, + "grad_norm": 0.09632397443056107, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3140 + }, + { + "epoch": 2.508785942492013, + "grad_norm": 0.12138333916664124, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3141 + }, + { + "epoch": 2.5095846645367414, + "grad_norm": 0.04163306951522827, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3142 + }, + { + "epoch": 2.5103833865814695, + "grad_norm": 0.06187185272574425, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3143 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.09463546425104141, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3144 + }, + { + "epoch": 2.5119808306709266, + "grad_norm": 0.12386980652809143, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3145 + }, + { + "epoch": 2.512779552715655, + "grad_norm": 0.07090163975954056, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3146 + }, + { + "epoch": 2.513578274760383, + "grad_norm": 0.04502219334244728, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3147 + }, + { + "epoch": 2.5143769968051117, + "grad_norm": 0.08453603833913803, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3148 + }, + { + "epoch": 2.5151757188498403, + "grad_norm": 0.08686821907758713, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3149 + }, + { + "epoch": 2.515974440894569, + "grad_norm": 0.03968734294176102, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3150 + }, + { + "epoch": 2.5167731629392973, + "grad_norm": 0.08613990992307663, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3151 + }, + { + "epoch": 2.5175718849840254, + "grad_norm": 0.07950794696807861, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3152 + }, + { + "epoch": 2.518370607028754, + "grad_norm": 0.0449741929769516, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3153 + }, + { + "epoch": 2.5191693290734825, + "grad_norm": 0.09032034873962402, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3154 + }, + { + "epoch": 2.519968051118211, + "grad_norm": 0.06834430247545242, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3155 + }, + { + "epoch": 2.520766773162939, + "grad_norm": 0.13820379972457886, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3156 + }, + { + "epoch": 2.5215654952076676, + "grad_norm": 0.17753586173057556, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3157 + }, + { + "epoch": 2.522364217252396, + "grad_norm": 0.2663286626338959, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3158 + }, + { + "epoch": 2.5231629392971247, + "grad_norm": 0.21509577333927155, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3159 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.04614022746682167, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3160 + }, + { + "epoch": 2.5247603833865817, + "grad_norm": 0.13719527423381805, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3161 + }, + { + "epoch": 2.52555910543131, + "grad_norm": 0.20119087398052216, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3162 + }, + { + "epoch": 2.5263578274760383, + "grad_norm": 0.1822054237127304, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3163 + }, + { + "epoch": 2.527156549520767, + "grad_norm": 0.06550543755292892, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3164 + }, + { + "epoch": 2.527955271565495, + "grad_norm": 0.08079471439123154, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3165 + }, + { + "epoch": 2.5287539936102235, + "grad_norm": 0.10106988251209259, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3166 + }, + { + "epoch": 2.529552715654952, + "grad_norm": 0.06818784028291702, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3167 + }, + { + "epoch": 2.5303514376996805, + "grad_norm": 0.05976718291640282, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3168 + }, + { + "epoch": 2.531150159744409, + "grad_norm": 0.18163853883743286, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3169 + }, + { + "epoch": 2.5319488817891376, + "grad_norm": 0.26418858766555786, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3170 + }, + { + "epoch": 2.5327476038338657, + "grad_norm": 0.24044150114059448, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3171 + }, + { + "epoch": 2.533546325878594, + "grad_norm": 0.07499254494905472, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3172 + }, + { + "epoch": 2.5343450479233227, + "grad_norm": 0.17483314871788025, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3173 + }, + { + "epoch": 2.5351437699680512, + "grad_norm": 0.2698160707950592, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3174 + }, + { + "epoch": 2.5359424920127793, + "grad_norm": 0.2116270661354065, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3175 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.0545198880136013, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3176 + }, + { + "epoch": 2.5375399361022364, + "grad_norm": 0.1926649659872055, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3177 + }, + { + "epoch": 2.538338658146965, + "grad_norm": 0.24152790009975433, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3178 + }, + { + "epoch": 2.5391373801916934, + "grad_norm": 0.12380969524383545, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3179 + }, + { + "epoch": 2.539936102236422, + "grad_norm": 0.07934054732322693, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3180 + }, + { + "epoch": 2.54073482428115, + "grad_norm": 0.13688413798809052, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3181 + }, + { + "epoch": 2.5415335463258786, + "grad_norm": 0.05832000821828842, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 3182 + }, + { + "epoch": 2.542332268370607, + "grad_norm": 0.08729993551969528, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3183 + }, + { + "epoch": 2.543130990415335, + "grad_norm": 0.16843630373477936, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3184 + }, + { + "epoch": 2.5439297124600637, + "grad_norm": 0.13045506179332733, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3185 + }, + { + "epoch": 2.5447284345047922, + "grad_norm": 0.038882140070199966, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3186 + }, + { + "epoch": 2.5455271565495208, + "grad_norm": 0.14922545850276947, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3187 + }, + { + "epoch": 2.5463258785942493, + "grad_norm": 0.1961440145969391, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3188 + }, + { + "epoch": 2.547124600638978, + "grad_norm": 0.08585302531719208, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3189 + }, + { + "epoch": 2.547923322683706, + "grad_norm": 0.13141697645187378, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3190 + }, + { + "epoch": 2.5487220447284344, + "grad_norm": 0.20332233607769012, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3191 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.1740144044160843, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3192 + }, + { + "epoch": 2.5503194888178915, + "grad_norm": 0.04738207906484604, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3193 + }, + { + "epoch": 2.5511182108626196, + "grad_norm": 0.23204317688941956, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3194 + }, + { + "epoch": 2.551916932907348, + "grad_norm": 0.29033714532852173, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3195 + }, + { + "epoch": 2.5527156549520766, + "grad_norm": 0.1251334547996521, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3196 + }, + { + "epoch": 2.553514376996805, + "grad_norm": 0.1610727608203888, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3197 + }, + { + "epoch": 2.5543130990415337, + "grad_norm": 0.284105509519577, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3198 + }, + { + "epoch": 2.5551118210862622, + "grad_norm": 0.1530643254518509, + "learning_rate": 0.0005, + "loss": 1.0324, + "step": 3199 + }, + { + "epoch": 2.5559105431309903, + "grad_norm": 0.07761498540639877, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3200 + }, + { + "epoch": 2.556709265175719, + "grad_norm": 0.16693277657032013, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3201 + }, + { + "epoch": 2.5575079872204474, + "grad_norm": 0.06345608085393906, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3202 + }, + { + "epoch": 2.5583067092651754, + "grad_norm": 0.10956210643053055, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3203 + }, + { + "epoch": 2.559105431309904, + "grad_norm": 0.17655007541179657, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3204 + }, + { + "epoch": 2.5599041533546325, + "grad_norm": 0.12615050375461578, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3205 + }, + { + "epoch": 2.560702875399361, + "grad_norm": 0.049671441316604614, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3206 + }, + { + "epoch": 2.5615015974440896, + "grad_norm": 0.16559815406799316, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3207 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 0.1279190182685852, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3208 + }, + { + "epoch": 2.563099041533546, + "grad_norm": 0.0540652722120285, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3209 + }, + { + "epoch": 2.5638977635782747, + "grad_norm": 0.1287074238061905, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 3210 + }, + { + "epoch": 2.5646964856230032, + "grad_norm": 0.1118067055940628, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3211 + }, + { + "epoch": 2.5654952076677318, + "grad_norm": 0.05159451439976692, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3212 + }, + { + "epoch": 2.56629392971246, + "grad_norm": 0.10654652118682861, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3213 + }, + { + "epoch": 2.5670926517571884, + "grad_norm": 0.15669982135295868, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3214 + }, + { + "epoch": 2.567891373801917, + "grad_norm": 0.11388157308101654, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3215 + }, + { + "epoch": 2.5686900958466454, + "grad_norm": 0.06434119492769241, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3216 + }, + { + "epoch": 2.569488817891374, + "grad_norm": 0.050070468336343765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3217 + }, + { + "epoch": 2.5702875399361025, + "grad_norm": 0.0522335022687912, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3218 + }, + { + "epoch": 2.5710862619808306, + "grad_norm": 0.04716494306921959, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3219 + }, + { + "epoch": 2.571884984025559, + "grad_norm": 0.03770711272954941, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3220 + }, + { + "epoch": 2.5726837060702876, + "grad_norm": 0.03955485299229622, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3221 + }, + { + "epoch": 2.5734824281150157, + "grad_norm": 0.03824841231107712, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3222 + }, + { + "epoch": 2.5742811501597442, + "grad_norm": 0.04722970351576805, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3223 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 0.05470758676528931, + "learning_rate": 0.0005, + "loss": 1.0371, + "step": 3224 + }, + { + "epoch": 2.5758785942492013, + "grad_norm": 0.04934269189834595, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3225 + }, + { + "epoch": 2.57667731629393, + "grad_norm": 0.040627289563417435, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3226 + }, + { + "epoch": 2.5774760383386583, + "grad_norm": 0.05668056383728981, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3227 + }, + { + "epoch": 2.5782747603833864, + "grad_norm": 0.11724753677845001, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3228 + }, + { + "epoch": 2.579073482428115, + "grad_norm": 0.12204517424106598, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3229 + }, + { + "epoch": 2.5798722044728435, + "grad_norm": 0.10652083158493042, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3230 + }, + { + "epoch": 2.580670926517572, + "grad_norm": 0.07430299371480942, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3231 + }, + { + "epoch": 2.5814696485623, + "grad_norm": 0.03460770472884178, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3232 + }, + { + "epoch": 2.5822683706070286, + "grad_norm": 0.080150306224823, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3233 + }, + { + "epoch": 2.583067092651757, + "grad_norm": 0.1291198879480362, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3234 + }, + { + "epoch": 2.5838658146964857, + "grad_norm": 0.19541533291339874, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3235 + }, + { + "epoch": 2.584664536741214, + "grad_norm": 0.24089939892292023, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3236 + }, + { + "epoch": 2.5854632587859427, + "grad_norm": 0.1933099627494812, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3237 + }, + { + "epoch": 2.586261980830671, + "grad_norm": 0.07295489311218262, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3238 + }, + { + "epoch": 2.5870607028753994, + "grad_norm": 0.10686071962118149, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3239 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 0.17052637040615082, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3240 + }, + { + "epoch": 2.588658146964856, + "grad_norm": 0.12377535551786423, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3241 + }, + { + "epoch": 2.5894568690095845, + "grad_norm": 0.03730800375342369, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3242 + }, + { + "epoch": 2.590255591054313, + "grad_norm": 0.13848428428173065, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3243 + }, + { + "epoch": 2.5910543130990416, + "grad_norm": 0.18361017107963562, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3244 + }, + { + "epoch": 2.59185303514377, + "grad_norm": 0.11140795797109604, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3245 + }, + { + "epoch": 2.5926517571884986, + "grad_norm": 0.033891428261995316, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3246 + }, + { + "epoch": 2.5934504792332267, + "grad_norm": 0.13179628551006317, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3247 + }, + { + "epoch": 2.594249201277955, + "grad_norm": 0.19785374402999878, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3248 + }, + { + "epoch": 2.5950479233226837, + "grad_norm": 0.15991398692131042, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3249 + }, + { + "epoch": 2.5958466453674123, + "grad_norm": 0.0702645480632782, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3250 + }, + { + "epoch": 2.5966453674121404, + "grad_norm": 0.038220152258872986, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3251 + }, + { + "epoch": 2.597444089456869, + "grad_norm": 0.048042308539152145, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3252 + }, + { + "epoch": 2.5982428115015974, + "grad_norm": 0.05673132464289665, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3253 + }, + { + "epoch": 2.599041533546326, + "grad_norm": 0.057284750044345856, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3254 + }, + { + "epoch": 2.5998402555910545, + "grad_norm": 0.052904874086380005, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3255 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.04914860427379608, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3256 + }, + { + "epoch": 2.601437699680511, + "grad_norm": 0.08870472013950348, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3257 + }, + { + "epoch": 2.6022364217252396, + "grad_norm": 0.09863728284835815, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3258 + }, + { + "epoch": 2.603035143769968, + "grad_norm": 0.08116353303194046, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3259 + }, + { + "epoch": 2.6038338658146962, + "grad_norm": 0.043653007596731186, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3260 + }, + { + "epoch": 2.6046325878594248, + "grad_norm": 0.0579618401825428, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3261 + }, + { + "epoch": 2.6054313099041533, + "grad_norm": 0.08072935789823532, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3262 + }, + { + "epoch": 2.606230031948882, + "grad_norm": 0.05391686409711838, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3263 + }, + { + "epoch": 2.6070287539936103, + "grad_norm": 0.03471128270030022, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3264 + }, + { + "epoch": 2.607827476038339, + "grad_norm": 0.056328870356082916, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3265 + }, + { + "epoch": 2.608626198083067, + "grad_norm": 0.05196002125740051, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3266 + }, + { + "epoch": 2.6094249201277955, + "grad_norm": 0.04338999465107918, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3267 + }, + { + "epoch": 2.610223642172524, + "grad_norm": 0.12365762889385223, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3268 + }, + { + "epoch": 2.6110223642172525, + "grad_norm": 0.19469699263572693, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3269 + }, + { + "epoch": 2.6118210862619806, + "grad_norm": 0.1825639009475708, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3270 + }, + { + "epoch": 2.612619808306709, + "grad_norm": 0.10235249251127243, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3271 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 0.05571124702692032, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3272 + }, + { + "epoch": 2.614217252396166, + "grad_norm": 0.1536952704191208, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3273 + }, + { + "epoch": 2.6150159744408947, + "grad_norm": 0.163212850689888, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3274 + }, + { + "epoch": 2.6158146964856233, + "grad_norm": 0.09640593826770782, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3275 + }, + { + "epoch": 2.6166134185303513, + "grad_norm": 0.04329126700758934, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3276 + }, + { + "epoch": 2.61741214057508, + "grad_norm": 0.03598733991384506, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3277 + }, + { + "epoch": 2.6182108626198084, + "grad_norm": 0.046664439141750336, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3278 + }, + { + "epoch": 2.6190095846645365, + "grad_norm": 0.03692904859781265, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3279 + }, + { + "epoch": 2.619808306709265, + "grad_norm": 0.0482964888215065, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3280 + }, + { + "epoch": 2.6206070287539935, + "grad_norm": 0.07996834069490433, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3281 + }, + { + "epoch": 2.621405750798722, + "grad_norm": 0.060141101479530334, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3282 + }, + { + "epoch": 2.6222044728434506, + "grad_norm": 0.04013051837682724, + "learning_rate": 0.0005, + "loss": 1.0328, + "step": 3283 + }, + { + "epoch": 2.623003194888179, + "grad_norm": 0.04011296480894089, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 3284 + }, + { + "epoch": 2.623801916932907, + "grad_norm": 0.04112064838409424, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3285 + }, + { + "epoch": 2.6246006389776357, + "grad_norm": 0.057281915098428726, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3286 + }, + { + "epoch": 2.6253993610223643, + "grad_norm": 0.06061771139502525, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3287 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 0.05844549089670181, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3288 + }, + { + "epoch": 2.626996805111821, + "grad_norm": 0.06354600191116333, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3289 + }, + { + "epoch": 2.6277955271565494, + "grad_norm": 0.04568248987197876, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3290 + }, + { + "epoch": 2.628594249201278, + "grad_norm": 0.04340318217873573, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3291 + }, + { + "epoch": 2.6293929712460065, + "grad_norm": 0.07078617066144943, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3292 + }, + { + "epoch": 2.630191693290735, + "grad_norm": 0.09865503013134003, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3293 + }, + { + "epoch": 2.6309904153354635, + "grad_norm": 0.08623871207237244, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3294 + }, + { + "epoch": 2.6317891373801916, + "grad_norm": 0.03787717968225479, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3295 + }, + { + "epoch": 2.63258785942492, + "grad_norm": 0.14653000235557556, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3296 + }, + { + "epoch": 2.6333865814696487, + "grad_norm": 0.2749452292919159, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3297 + }, + { + "epoch": 2.6341853035143767, + "grad_norm": 0.28424543142318726, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3298 + }, + { + "epoch": 2.6349840255591053, + "grad_norm": 0.17354224622249603, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3299 + }, + { + "epoch": 2.635782747603834, + "grad_norm": 0.04208464175462723, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3300 + }, + { + "epoch": 2.6365814696485623, + "grad_norm": 0.15522420406341553, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3301 + }, + { + "epoch": 2.637380191693291, + "grad_norm": 0.17986370623111725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3302 + }, + { + "epoch": 2.6381789137380194, + "grad_norm": 0.07155515998601913, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3303 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.11287503689527512, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3304 + }, + { + "epoch": 2.639776357827476, + "grad_norm": 0.22735139727592468, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3305 + }, + { + "epoch": 2.6405750798722045, + "grad_norm": 0.23528814315795898, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3306 + }, + { + "epoch": 2.641373801916933, + "grad_norm": 0.13828198611736298, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3307 + }, + { + "epoch": 2.642172523961661, + "grad_norm": 0.046783462166786194, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3308 + }, + { + "epoch": 2.6429712460063897, + "grad_norm": 0.13010001182556152, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3309 + }, + { + "epoch": 2.643769968051118, + "grad_norm": 0.12339942902326584, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3310 + }, + { + "epoch": 2.6445686900958467, + "grad_norm": 0.06443019211292267, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3311 + }, + { + "epoch": 2.6453674121405752, + "grad_norm": 0.05086766183376312, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3312 + }, + { + "epoch": 2.6461661341853038, + "grad_norm": 0.1266956627368927, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3313 + }, + { + "epoch": 2.646964856230032, + "grad_norm": 0.1238899901509285, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3314 + }, + { + "epoch": 2.6477635782747604, + "grad_norm": 0.07378736138343811, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3315 + }, + { + "epoch": 2.648562300319489, + "grad_norm": 0.12572194635868073, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3316 + }, + { + "epoch": 2.649361022364217, + "grad_norm": 0.18099260330200195, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3317 + }, + { + "epoch": 2.6501597444089455, + "grad_norm": 0.1383541077375412, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3318 + }, + { + "epoch": 2.650958466453674, + "grad_norm": 0.043900374323129654, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3319 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 0.13228318095207214, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3320 + }, + { + "epoch": 2.652555910543131, + "grad_norm": 0.11684399843215942, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3321 + }, + { + "epoch": 2.6533546325878596, + "grad_norm": 0.03879965469241142, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3322 + }, + { + "epoch": 2.6541533546325877, + "grad_norm": 0.1457953006029129, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3323 + }, + { + "epoch": 2.6549520766773163, + "grad_norm": 0.21643802523612976, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3324 + }, + { + "epoch": 2.655750798722045, + "grad_norm": 0.20250067114830017, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3325 + }, + { + "epoch": 2.6565495207667733, + "grad_norm": 0.09131773561239243, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3326 + }, + { + "epoch": 2.6573482428115014, + "grad_norm": 0.07217761129140854, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3327 + }, + { + "epoch": 2.65814696485623, + "grad_norm": 0.13251517713069916, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3328 + }, + { + "epoch": 2.6589456869009584, + "grad_norm": 0.09462655335664749, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3329 + }, + { + "epoch": 2.659744408945687, + "grad_norm": 0.04496161639690399, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3330 + }, + { + "epoch": 2.6605431309904155, + "grad_norm": 0.13246162235736847, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3331 + }, + { + "epoch": 2.661341853035144, + "grad_norm": 0.1548391878604889, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3332 + }, + { + "epoch": 2.662140575079872, + "grad_norm": 0.09438800066709518, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3333 + }, + { + "epoch": 2.6629392971246006, + "grad_norm": 0.033411599695682526, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3334 + }, + { + "epoch": 2.663738019169329, + "grad_norm": 0.04015564173460007, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3335 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 0.033046361058950424, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3336 + }, + { + "epoch": 2.665335463258786, + "grad_norm": 0.04766019433736801, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3337 + }, + { + "epoch": 2.6661341853035143, + "grad_norm": 0.06365641951560974, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3338 + }, + { + "epoch": 2.666932907348243, + "grad_norm": 0.03329809010028839, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3339 + }, + { + "epoch": 2.6677316293929714, + "grad_norm": 0.10063061863183975, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3340 + }, + { + "epoch": 2.6685303514377, + "grad_norm": 0.16541579365730286, + "learning_rate": 0.0005, + "loss": 1.0364, + "step": 3341 + }, + { + "epoch": 2.669329073482428, + "grad_norm": 0.18877379596233368, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3342 + }, + { + "epoch": 2.6701277955271565, + "grad_norm": 0.12577234208583832, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3343 + }, + { + "epoch": 2.670926517571885, + "grad_norm": 0.04403039440512657, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3344 + }, + { + "epoch": 2.6717252396166136, + "grad_norm": 0.172403946518898, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3345 + }, + { + "epoch": 2.6725239616613417, + "grad_norm": 0.2147791087627411, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3346 + }, + { + "epoch": 2.67332268370607, + "grad_norm": 0.1536005735397339, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3347 + }, + { + "epoch": 2.6741214057507987, + "grad_norm": 0.061038631945848465, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3348 + }, + { + "epoch": 2.6749201277955272, + "grad_norm": 0.03402748703956604, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3349 + }, + { + "epoch": 2.6757188498402558, + "grad_norm": 0.05285736918449402, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3350 + }, + { + "epoch": 2.6765175718849843, + "grad_norm": 0.0807662233710289, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3351 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.057097889482975006, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3352 + }, + { + "epoch": 2.678115015974441, + "grad_norm": 0.06845760345458984, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3353 + }, + { + "epoch": 2.6789137380191694, + "grad_norm": 0.1209796816110611, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3354 + }, + { + "epoch": 2.6797124600638975, + "grad_norm": 0.09372428804636002, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3355 + }, + { + "epoch": 2.680511182108626, + "grad_norm": 0.03795485943555832, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3356 + }, + { + "epoch": 2.6813099041533546, + "grad_norm": 0.14420334994792938, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3357 + }, + { + "epoch": 2.682108626198083, + "grad_norm": 0.23049019277095795, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3358 + }, + { + "epoch": 2.6829073482428116, + "grad_norm": 0.21722057461738586, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3359 + }, + { + "epoch": 2.68370607028754, + "grad_norm": 0.0968366488814354, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3360 + }, + { + "epoch": 2.6845047923322682, + "grad_norm": 0.10279416292905807, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 3361 + }, + { + "epoch": 2.6853035143769968, + "grad_norm": 0.2077404409646988, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3362 + }, + { + "epoch": 2.6861022364217253, + "grad_norm": 0.14186711609363556, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3363 + }, + { + "epoch": 2.686900958466454, + "grad_norm": 0.04573604837059975, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3364 + }, + { + "epoch": 2.687699680511182, + "grad_norm": 0.13861627876758575, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3365 + }, + { + "epoch": 2.6884984025559104, + "grad_norm": 0.17746120691299438, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3366 + }, + { + "epoch": 2.689297124600639, + "grad_norm": 0.15865683555603027, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3367 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.05537402629852295, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3368 + }, + { + "epoch": 2.690894568690096, + "grad_norm": 0.064423106610775, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3369 + }, + { + "epoch": 2.6916932907348246, + "grad_norm": 0.0922585278749466, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3370 + }, + { + "epoch": 2.6924920127795526, + "grad_norm": 0.08034171909093857, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3371 + }, + { + "epoch": 2.693290734824281, + "grad_norm": 0.05695292726159096, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 3372 + }, + { + "epoch": 2.6940894568690097, + "grad_norm": 0.04140406847000122, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3373 + }, + { + "epoch": 2.6948881789137378, + "grad_norm": 0.038130711764097214, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3374 + }, + { + "epoch": 2.6956869009584663, + "grad_norm": 0.07363594323396683, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3375 + }, + { + "epoch": 2.696485623003195, + "grad_norm": 0.13670513033866882, + "learning_rate": 0.0005, + "loss": 1.0391, + "step": 3376 + }, + { + "epoch": 2.6972843450479234, + "grad_norm": 0.16614536941051483, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3377 + }, + { + "epoch": 2.698083067092652, + "grad_norm": 0.1346762478351593, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3378 + }, + { + "epoch": 2.6988817891373804, + "grad_norm": 0.06321856379508972, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 3379 + }, + { + "epoch": 2.6996805111821085, + "grad_norm": 0.057517897337675095, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3380 + }, + { + "epoch": 2.700479233226837, + "grad_norm": 0.11995001137256622, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 3381 + }, + { + "epoch": 2.7012779552715656, + "grad_norm": 0.10514877736568451, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3382 + }, + { + "epoch": 2.702076677316294, + "grad_norm": 0.05942686274647713, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3383 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 0.03508206829428673, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3384 + }, + { + "epoch": 2.7036741214057507, + "grad_norm": 0.05182692036032677, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3385 + }, + { + "epoch": 2.7044728434504792, + "grad_norm": 0.0597345344722271, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3386 + }, + { + "epoch": 2.7052715654952078, + "grad_norm": 0.037486087530851364, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3387 + }, + { + "epoch": 2.7060702875399363, + "grad_norm": 0.040483538061380386, + "learning_rate": 0.0005, + "loss": 1.0352, + "step": 3388 + }, + { + "epoch": 2.706869009584665, + "grad_norm": 0.044094670563936234, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3389 + }, + { + "epoch": 2.707667731629393, + "grad_norm": 0.06498228758573532, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3390 + }, + { + "epoch": 2.7084664536741214, + "grad_norm": 0.06955298781394958, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3391 + }, + { + "epoch": 2.70926517571885, + "grad_norm": 0.11691966652870178, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3392 + }, + { + "epoch": 2.710063897763578, + "grad_norm": 0.1183234304189682, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3393 + }, + { + "epoch": 2.7108626198083066, + "grad_norm": 0.08358792215585709, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3394 + }, + { + "epoch": 2.711661341853035, + "grad_norm": 0.04190056398510933, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3395 + }, + { + "epoch": 2.7124600638977636, + "grad_norm": 0.09757649153470993, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3396 + }, + { + "epoch": 2.713258785942492, + "grad_norm": 0.11508934944868088, + "learning_rate": 0.0005, + "loss": 1.0406, + "step": 3397 + }, + { + "epoch": 2.7140575079872207, + "grad_norm": 0.05612087994813919, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3398 + }, + { + "epoch": 2.7148562300319488, + "grad_norm": 0.07044408470392227, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3399 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.07732822746038437, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3400 + }, + { + "epoch": 2.716453674121406, + "grad_norm": 0.054326847195625305, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3401 + }, + { + "epoch": 2.7172523961661343, + "grad_norm": 0.041327398270368576, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3402 + }, + { + "epoch": 2.7180511182108624, + "grad_norm": 0.07147548347711563, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3403 + }, + { + "epoch": 2.718849840255591, + "grad_norm": 0.12999942898750305, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3404 + }, + { + "epoch": 2.7196485623003195, + "grad_norm": 0.18404515087604523, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3405 + }, + { + "epoch": 2.720447284345048, + "grad_norm": 0.1873377114534378, + "learning_rate": 0.0005, + "loss": 1.0407, + "step": 3406 + }, + { + "epoch": 2.7212460063897765, + "grad_norm": 0.0732024610042572, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3407 + }, + { + "epoch": 2.722044728434505, + "grad_norm": 0.07602795958518982, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 3408 + }, + { + "epoch": 2.722843450479233, + "grad_norm": 0.07871323823928833, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3409 + }, + { + "epoch": 2.7236421725239617, + "grad_norm": 0.0738302692770958, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3410 + }, + { + "epoch": 2.72444089456869, + "grad_norm": 0.12097286432981491, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3411 + }, + { + "epoch": 2.7252396166134183, + "grad_norm": 0.10136821120977402, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3412 + }, + { + "epoch": 2.726038338658147, + "grad_norm": 0.07281512022018433, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3413 + }, + { + "epoch": 2.7268370607028753, + "grad_norm": 0.09425969421863556, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3414 + }, + { + "epoch": 2.727635782747604, + "grad_norm": 0.11939436942338943, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3415 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 0.07181181758642197, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3416 + }, + { + "epoch": 2.729233226837061, + "grad_norm": 0.06634730845689774, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3417 + }, + { + "epoch": 2.730031948881789, + "grad_norm": 0.0941692590713501, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3418 + }, + { + "epoch": 2.7308306709265175, + "grad_norm": 0.10803452879190445, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3419 + }, + { + "epoch": 2.731629392971246, + "grad_norm": 0.08289305865764618, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3420 + }, + { + "epoch": 2.7324281150159746, + "grad_norm": 0.048421960324048996, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3421 + }, + { + "epoch": 2.7332268370607027, + "grad_norm": 0.09108635783195496, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3422 + }, + { + "epoch": 2.734025559105431, + "grad_norm": 0.13627508282661438, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3423 + }, + { + "epoch": 2.7348242811501597, + "grad_norm": 0.14651858806610107, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3424 + }, + { + "epoch": 2.7356230031948883, + "grad_norm": 0.126741424202919, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3425 + }, + { + "epoch": 2.736421725239617, + "grad_norm": 0.05885545164346695, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3426 + }, + { + "epoch": 2.737220447284345, + "grad_norm": 0.09471739828586578, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3427 + }, + { + "epoch": 2.7380191693290734, + "grad_norm": 0.18026123940944672, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3428 + }, + { + "epoch": 2.738817891373802, + "grad_norm": 0.1737871915102005, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3429 + }, + { + "epoch": 2.7396166134185305, + "grad_norm": 0.052994512021541595, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3430 + }, + { + "epoch": 2.7404153354632586, + "grad_norm": 0.13484452664852142, + "learning_rate": 0.0005, + "loss": 1.0377, + "step": 3431 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 0.2207227200269699, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3432 + }, + { + "epoch": 2.7420127795527156, + "grad_norm": 0.17741963267326355, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3433 + }, + { + "epoch": 2.742811501597444, + "grad_norm": 0.07451824843883514, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3434 + }, + { + "epoch": 2.7436102236421727, + "grad_norm": 0.07947403192520142, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3435 + }, + { + "epoch": 2.744408945686901, + "grad_norm": 0.11197762936353683, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3436 + }, + { + "epoch": 2.7452076677316293, + "grad_norm": 0.08398377895355225, + "learning_rate": 0.0005, + "loss": 1.0357, + "step": 3437 + }, + { + "epoch": 2.746006389776358, + "grad_norm": 0.03809420019388199, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3438 + }, + { + "epoch": 2.7468051118210863, + "grad_norm": 0.11537694931030273, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3439 + }, + { + "epoch": 2.747603833865815, + "grad_norm": 0.1537221372127533, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 3440 + }, + { + "epoch": 2.748402555910543, + "grad_norm": 0.1132403165102005, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3441 + }, + { + "epoch": 2.7492012779552715, + "grad_norm": 0.038440920412540436, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3442 + }, + { + "epoch": 2.75, + "grad_norm": 0.10132595151662827, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3443 + }, + { + "epoch": 2.7507987220447285, + "grad_norm": 0.12446253001689911, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3444 + }, + { + "epoch": 2.751597444089457, + "grad_norm": 0.05364474281668663, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3445 + }, + { + "epoch": 2.752396166134185, + "grad_norm": 0.04705234244465828, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3446 + }, + { + "epoch": 2.7531948881789137, + "grad_norm": 0.10524975508451462, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3447 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 0.12036000937223434, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3448 + }, + { + "epoch": 2.7547923322683707, + "grad_norm": 0.08042819797992706, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3449 + }, + { + "epoch": 2.755591054313099, + "grad_norm": 0.04404102638363838, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3450 + }, + { + "epoch": 2.7563897763578273, + "grad_norm": 0.0766257792711258, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3451 + }, + { + "epoch": 2.757188498402556, + "grad_norm": 0.06359248608350754, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3452 + }, + { + "epoch": 2.7579872204472844, + "grad_norm": 0.06752901524305344, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3453 + }, + { + "epoch": 2.758785942492013, + "grad_norm": 0.12018375843763351, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3454 + }, + { + "epoch": 2.7595846645367414, + "grad_norm": 0.15904727578163147, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3455 + }, + { + "epoch": 2.7603833865814695, + "grad_norm": 0.12665021419525146, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3456 + }, + { + "epoch": 2.761182108626198, + "grad_norm": 0.07552342861890793, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3457 + }, + { + "epoch": 2.7619808306709266, + "grad_norm": 0.25927653908729553, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3458 + }, + { + "epoch": 2.762779552715655, + "grad_norm": 0.3487590253353119, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3459 + }, + { + "epoch": 2.763578274760383, + "grad_norm": 0.2783665359020233, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3460 + }, + { + "epoch": 2.7643769968051117, + "grad_norm": 0.054424334317445755, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3461 + }, + { + "epoch": 2.7651757188498403, + "grad_norm": 0.240921288728714, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3462 + }, + { + "epoch": 2.765974440894569, + "grad_norm": 0.3380962014198303, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3463 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.1514623463153839, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3464 + }, + { + "epoch": 2.7675718849840254, + "grad_norm": 0.15135464072227478, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3465 + }, + { + "epoch": 2.768370607028754, + "grad_norm": 0.262546181678772, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3466 + }, + { + "epoch": 2.7691693290734825, + "grad_norm": 0.11052273958921432, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3467 + }, + { + "epoch": 2.769968051118211, + "grad_norm": 0.14473804831504822, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3468 + }, + { + "epoch": 2.770766773162939, + "grad_norm": 0.24968142807483673, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3469 + }, + { + "epoch": 2.7715654952076676, + "grad_norm": 0.13638247549533844, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3470 + }, + { + "epoch": 2.772364217252396, + "grad_norm": 0.0957072302699089, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3471 + }, + { + "epoch": 2.7731629392971247, + "grad_norm": 0.2122000902891159, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3472 + }, + { + "epoch": 2.773961661341853, + "grad_norm": 0.15716226398944855, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3473 + }, + { + "epoch": 2.7747603833865817, + "grad_norm": 0.05107169970870018, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3474 + }, + { + "epoch": 2.77555910543131, + "grad_norm": 0.19824674725532532, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3475 + }, + { + "epoch": 2.7763578274760383, + "grad_norm": 0.16866235435009003, + "learning_rate": 0.0005, + "loss": 1.0349, + "step": 3476 + }, + { + "epoch": 2.777156549520767, + "grad_norm": 0.03332412987947464, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3477 + }, + { + "epoch": 2.777955271565495, + "grad_norm": 0.1771237850189209, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3478 + }, + { + "epoch": 2.7787539936102235, + "grad_norm": 0.23501509428024292, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3479 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.0976579561829567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3480 + }, + { + "epoch": 2.7803514376996805, + "grad_norm": 0.11640458554029465, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3481 + }, + { + "epoch": 2.781150159744409, + "grad_norm": 0.2140960842370987, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3482 + }, + { + "epoch": 2.7819488817891376, + "grad_norm": 0.2055736929178238, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 3483 + }, + { + "epoch": 2.7827476038338657, + "grad_norm": 0.09386937320232391, + "learning_rate": 0.0005, + "loss": 1.0369, + "step": 3484 + }, + { + "epoch": 2.783546325878594, + "grad_norm": 0.11534380912780762, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3485 + }, + { + "epoch": 2.7843450479233227, + "grad_norm": 0.19186711311340332, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3486 + }, + { + "epoch": 2.7851437699680512, + "grad_norm": 0.26858124136924744, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3487 + }, + { + "epoch": 2.7859424920127793, + "grad_norm": 0.05965370684862137, + "learning_rate": 0.0005, + "loss": 1.0355, + "step": 3488 + }, + { + "epoch": 2.786741214057508, + "grad_norm": 0.17804528772830963, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3489 + }, + { + "epoch": 2.7875399361022364, + "grad_norm": 0.1802065223455429, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 3490 + }, + { + "epoch": 2.788338658146965, + "grad_norm": 0.06634502857923508, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3491 + }, + { + "epoch": 2.7891373801916934, + "grad_norm": 0.06682102382183075, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3492 + }, + { + "epoch": 2.789936102236422, + "grad_norm": 0.08941584080457687, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3493 + }, + { + "epoch": 2.79073482428115, + "grad_norm": 0.06336037069559097, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3494 + }, + { + "epoch": 2.7915335463258786, + "grad_norm": 0.05562690272927284, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3495 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.10294149816036224, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3496 + }, + { + "epoch": 2.793130990415335, + "grad_norm": 0.11363442987203598, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3497 + }, + { + "epoch": 2.7939297124600637, + "grad_norm": 0.05790446698665619, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3498 + }, + { + "epoch": 2.7947284345047922, + "grad_norm": 0.09351370483636856, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3499 + }, + { + "epoch": 2.7955271565495208, + "grad_norm": 0.2225412130355835, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3500 + }, + { + "epoch": 2.7963258785942493, + "grad_norm": 0.21828165650367737, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3501 + }, + { + "epoch": 2.797124600638978, + "grad_norm": 0.06987733393907547, + "learning_rate": 0.0005, + "loss": 1.039, + "step": 3502 + }, + { + "epoch": 2.797923322683706, + "grad_norm": 0.14518103003501892, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 3503 + }, + { + "epoch": 2.7987220447284344, + "grad_norm": 0.24233761429786682, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3504 + }, + { + "epoch": 2.799520766773163, + "grad_norm": 0.19286365807056427, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3505 + }, + { + "epoch": 2.8003194888178915, + "grad_norm": 0.07981286942958832, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3506 + }, + { + "epoch": 2.8011182108626196, + "grad_norm": 0.050319187343120575, + "learning_rate": 0.0005, + "loss": 1.0381, + "step": 3507 + }, + { + "epoch": 2.801916932907348, + "grad_norm": 0.09955406934022903, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3508 + }, + { + "epoch": 2.8027156549520766, + "grad_norm": 0.048427898436784744, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3509 + }, + { + "epoch": 2.803514376996805, + "grad_norm": 0.0805777907371521, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3510 + }, + { + "epoch": 2.8043130990415337, + "grad_norm": 0.07289621978998184, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3511 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.04940955713391304, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3512 + }, + { + "epoch": 2.8059105431309903, + "grad_norm": 0.07228294759988785, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3513 + }, + { + "epoch": 2.806709265175719, + "grad_norm": 0.06902103871107101, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3514 + }, + { + "epoch": 2.8075079872204474, + "grad_norm": 0.056301236152648926, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3515 + }, + { + "epoch": 2.8083067092651754, + "grad_norm": 0.03880859166383743, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3516 + }, + { + "epoch": 2.809105431309904, + "grad_norm": 0.04914811998605728, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3517 + }, + { + "epoch": 2.8099041533546325, + "grad_norm": 0.04139270633459091, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 3518 + }, + { + "epoch": 2.810702875399361, + "grad_norm": 0.05118592828512192, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3519 + }, + { + "epoch": 2.8115015974440896, + "grad_norm": 0.03548616170883179, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 3520 + }, + { + "epoch": 2.812300319488818, + "grad_norm": 0.04883241280913353, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3521 + }, + { + "epoch": 2.813099041533546, + "grad_norm": 0.044492170214653015, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3522 + }, + { + "epoch": 2.8138977635782747, + "grad_norm": 0.050978366285562515, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3523 + }, + { + "epoch": 2.8146964856230032, + "grad_norm": 0.04663826525211334, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3524 + }, + { + "epoch": 2.8154952076677318, + "grad_norm": 0.06378154456615448, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3525 + }, + { + "epoch": 2.81629392971246, + "grad_norm": 0.06913618743419647, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3526 + }, + { + "epoch": 2.8170926517571884, + "grad_norm": 0.084662064909935, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3527 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 0.08352439105510712, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3528 + }, + { + "epoch": 2.8186900958466454, + "grad_norm": 0.07254189252853394, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3529 + }, + { + "epoch": 2.819488817891374, + "grad_norm": 0.04416285827755928, + "learning_rate": 0.0005, + "loss": 1.0319, + "step": 3530 + }, + { + "epoch": 2.8202875399361025, + "grad_norm": 0.056230951100587845, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3531 + }, + { + "epoch": 2.8210862619808306, + "grad_norm": 0.11055732518434525, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3532 + }, + { + "epoch": 2.821884984025559, + "grad_norm": 0.08660246431827545, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3533 + }, + { + "epoch": 2.8226837060702876, + "grad_norm": 0.0691947191953659, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3534 + }, + { + "epoch": 2.8234824281150157, + "grad_norm": 0.09254545718431473, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3535 + }, + { + "epoch": 2.8242811501597442, + "grad_norm": 0.0663340613245964, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3536 + }, + { + "epoch": 2.8250798722044728, + "grad_norm": 0.05052514374256134, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3537 + }, + { + "epoch": 2.8258785942492013, + "grad_norm": 0.08364969491958618, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3538 + }, + { + "epoch": 2.82667731629393, + "grad_norm": 0.08269570767879486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3539 + }, + { + "epoch": 2.8274760383386583, + "grad_norm": 0.06289245933294296, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3540 + }, + { + "epoch": 2.8282747603833864, + "grad_norm": 0.03565627336502075, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3541 + }, + { + "epoch": 2.829073482428115, + "grad_norm": 0.057896651327610016, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3542 + }, + { + "epoch": 2.8298722044728435, + "grad_norm": 0.046379514038562775, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3543 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.06231336295604706, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3544 + }, + { + "epoch": 2.8314696485623, + "grad_norm": 0.03983502462506294, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3545 + }, + { + "epoch": 2.8322683706070286, + "grad_norm": 0.07364759594202042, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3546 + }, + { + "epoch": 2.833067092651757, + "grad_norm": 0.11596816778182983, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3547 + }, + { + "epoch": 2.8338658146964857, + "grad_norm": 0.10731378942728043, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3548 + }, + { + "epoch": 2.834664536741214, + "grad_norm": 0.06365050375461578, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3549 + }, + { + "epoch": 2.8354632587859427, + "grad_norm": 0.055451441556215286, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3550 + }, + { + "epoch": 2.836261980830671, + "grad_norm": 0.1490558534860611, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3551 + }, + { + "epoch": 2.8370607028753994, + "grad_norm": 0.1539796143770218, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3552 + }, + { + "epoch": 2.837859424920128, + "grad_norm": 0.06760501861572266, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3553 + }, + { + "epoch": 2.838658146964856, + "grad_norm": 0.0685611367225647, + "learning_rate": 0.0005, + "loss": 1.0341, + "step": 3554 + }, + { + "epoch": 2.8394568690095845, + "grad_norm": 0.14234358072280884, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3555 + }, + { + "epoch": 2.840255591054313, + "grad_norm": 0.14428865909576416, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3556 + }, + { + "epoch": 2.8410543130990416, + "grad_norm": 0.07594695687294006, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3557 + }, + { + "epoch": 2.84185303514377, + "grad_norm": 0.040841538459062576, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3558 + }, + { + "epoch": 2.8426517571884986, + "grad_norm": 0.04991824924945831, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3559 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 0.03846943378448486, + "learning_rate": 0.0005, + "loss": 1.0359, + "step": 3560 + }, + { + "epoch": 2.844249201277955, + "grad_norm": 0.04851507395505905, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3561 + }, + { + "epoch": 2.8450479233226837, + "grad_norm": 0.0635538399219513, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3562 + }, + { + "epoch": 2.8458466453674123, + "grad_norm": 0.11812663078308105, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3563 + }, + { + "epoch": 2.8466453674121404, + "grad_norm": 0.05664098262786865, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3564 + }, + { + "epoch": 2.847444089456869, + "grad_norm": 0.03532585874199867, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3565 + }, + { + "epoch": 2.8482428115015974, + "grad_norm": 0.06758403033018112, + "learning_rate": 0.0005, + "loss": 1.0362, + "step": 3566 + }, + { + "epoch": 2.849041533546326, + "grad_norm": 0.06279300898313522, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 3567 + }, + { + "epoch": 2.8498402555910545, + "grad_norm": 0.043967198580503464, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3568 + }, + { + "epoch": 2.850638977635783, + "grad_norm": 0.04900701716542244, + "learning_rate": 0.0005, + "loss": 1.0386, + "step": 3569 + }, + { + "epoch": 2.851437699680511, + "grad_norm": 0.07339311391115189, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3570 + }, + { + "epoch": 2.8522364217252396, + "grad_norm": 0.10644743591547012, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3571 + }, + { + "epoch": 2.853035143769968, + "grad_norm": 0.10544353723526001, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3572 + }, + { + "epoch": 2.8538338658146962, + "grad_norm": 0.0590951181948185, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3573 + }, + { + "epoch": 2.8546325878594248, + "grad_norm": 0.05038939788937569, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3574 + }, + { + "epoch": 2.8554313099041533, + "grad_norm": 0.06013040617108345, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3575 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.07330521196126938, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3576 + }, + { + "epoch": 2.8570287539936103, + "grad_norm": 0.12049853056669235, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3577 + }, + { + "epoch": 2.857827476038339, + "grad_norm": 0.13056780397891998, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3578 + }, + { + "epoch": 2.858626198083067, + "grad_norm": 0.12987029552459717, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3579 + }, + { + "epoch": 2.8594249201277955, + "grad_norm": 0.08681001514196396, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3580 + }, + { + "epoch": 2.860223642172524, + "grad_norm": 0.060947105288505554, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3581 + }, + { + "epoch": 2.8610223642172525, + "grad_norm": 0.10896368324756622, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3582 + }, + { + "epoch": 2.8618210862619806, + "grad_norm": 0.1251460760831833, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3583 + }, + { + "epoch": 2.862619808306709, + "grad_norm": 0.035174671560525894, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3584 + }, + { + "epoch": 2.8634185303514377, + "grad_norm": 0.12026303261518478, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3585 + }, + { + "epoch": 2.864217252396166, + "grad_norm": 0.16679063439369202, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 3586 + }, + { + "epoch": 2.8650159744408947, + "grad_norm": 0.19229409098625183, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3587 + }, + { + "epoch": 2.8658146964856233, + "grad_norm": 0.17964699864387512, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3588 + }, + { + "epoch": 2.8666134185303513, + "grad_norm": 0.10671430081129074, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3589 + }, + { + "epoch": 2.86741214057508, + "grad_norm": 0.04453161358833313, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 3590 + }, + { + "epoch": 2.8682108626198084, + "grad_norm": 0.1531655639410019, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 3591 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.19321779906749725, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3592 + }, + { + "epoch": 2.869808306709265, + "grad_norm": 0.19540782272815704, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3593 + }, + { + "epoch": 2.8706070287539935, + "grad_norm": 0.22210878133773804, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3594 + }, + { + "epoch": 2.871405750798722, + "grad_norm": 0.2089247703552246, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 3595 + }, + { + "epoch": 2.8722044728434506, + "grad_norm": 0.11910446733236313, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3596 + }, + { + "epoch": 2.873003194888179, + "grad_norm": 0.05230247974395752, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3597 + }, + { + "epoch": 2.873801916932907, + "grad_norm": 0.09492263197898865, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3598 + }, + { + "epoch": 2.8746006389776357, + "grad_norm": 0.1396690160036087, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3599 + }, + { + "epoch": 2.8753993610223643, + "grad_norm": 0.12218718230724335, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3600 + }, + { + "epoch": 2.876198083067093, + "grad_norm": 0.05510007217526436, + "learning_rate": 0.0005, + "loss": 1.0334, + "step": 3601 + }, + { + "epoch": 2.876996805111821, + "grad_norm": 0.04949348792433739, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3602 + }, + { + "epoch": 2.8777955271565494, + "grad_norm": 0.06522537767887115, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3603 + }, + { + "epoch": 2.878594249201278, + "grad_norm": 0.034176018089056015, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3604 + }, + { + "epoch": 2.8793929712460065, + "grad_norm": 0.07579770684242249, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 3605 + }, + { + "epoch": 2.880191693290735, + "grad_norm": 0.09512948244810104, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 3606 + }, + { + "epoch": 2.8809904153354635, + "grad_norm": 0.059753213077783585, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3607 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.2461470365524292, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3608 + }, + { + "epoch": 2.88258785942492, + "grad_norm": 0.11298660188913345, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3609 + }, + { + "epoch": 2.8833865814696487, + "grad_norm": 0.20638997852802277, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3610 + }, + { + "epoch": 2.8841853035143767, + "grad_norm": 0.2394232600927353, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3611 + }, + { + "epoch": 2.8849840255591053, + "grad_norm": 0.15168963372707367, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3612 + }, + { + "epoch": 2.885782747603834, + "grad_norm": 0.03990825638175011, + "learning_rate": 0.0005, + "loss": 1.0343, + "step": 3613 + }, + { + "epoch": 2.8865814696485623, + "grad_norm": 0.1725347936153412, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3614 + }, + { + "epoch": 2.887380191693291, + "grad_norm": 0.20821869373321533, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3615 + }, + { + "epoch": 2.8881789137380194, + "grad_norm": 0.14441269636154175, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3616 + }, + { + "epoch": 2.8889776357827475, + "grad_norm": 0.037162624299526215, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3617 + }, + { + "epoch": 2.889776357827476, + "grad_norm": 0.11550657451152802, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3618 + }, + { + "epoch": 2.8905750798722045, + "grad_norm": 0.15214277803897858, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3619 + }, + { + "epoch": 2.891373801916933, + "grad_norm": 0.09059946238994598, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 3620 + }, + { + "epoch": 2.892172523961661, + "grad_norm": 0.03436599299311638, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3621 + }, + { + "epoch": 2.8929712460063897, + "grad_norm": 0.0839625746011734, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3622 + }, + { + "epoch": 2.893769968051118, + "grad_norm": 0.1618664264678955, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3623 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.08216597139835358, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3624 + }, + { + "epoch": 2.8953674121405752, + "grad_norm": 0.06303965300321579, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3625 + }, + { + "epoch": 2.8961661341853038, + "grad_norm": 0.050278183072805405, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3626 + }, + { + "epoch": 2.896964856230032, + "grad_norm": 0.04620242863893509, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3627 + }, + { + "epoch": 2.8977635782747604, + "grad_norm": 0.04937691614031792, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3628 + }, + { + "epoch": 2.898562300319489, + "grad_norm": 0.056928347796201706, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3629 + }, + { + "epoch": 2.899361022364217, + "grad_norm": 0.04932256042957306, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3630 + }, + { + "epoch": 2.9001597444089455, + "grad_norm": 0.04320303350687027, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3631 + }, + { + "epoch": 2.900958466453674, + "grad_norm": 0.08589868247509003, + "learning_rate": 0.0005, + "loss": 1.0384, + "step": 3632 + }, + { + "epoch": 2.9017571884984026, + "grad_norm": 0.11458484083414078, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3633 + }, + { + "epoch": 2.902555910543131, + "grad_norm": 0.13549752533435822, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3634 + }, + { + "epoch": 2.9033546325878596, + "grad_norm": 0.1327086091041565, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3635 + }, + { + "epoch": 2.9041533546325877, + "grad_norm": 0.08295682817697525, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3636 + }, + { + "epoch": 2.9049520766773163, + "grad_norm": 0.05216526240110397, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3637 + }, + { + "epoch": 2.905750798722045, + "grad_norm": 0.11048691719770432, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3638 + }, + { + "epoch": 2.9065495207667733, + "grad_norm": 0.17681372165679932, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3639 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.16901300847530365, + "learning_rate": 0.0005, + "loss": 1.0361, + "step": 3640 + }, + { + "epoch": 2.90814696485623, + "grad_norm": 0.10261020064353943, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 3641 + }, + { + "epoch": 2.9089456869009584, + "grad_norm": 0.042478349059820175, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3642 + }, + { + "epoch": 2.909744408945687, + "grad_norm": 0.11727496981620789, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3643 + }, + { + "epoch": 2.9105431309904155, + "grad_norm": 0.14884977042675018, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 3644 + }, + { + "epoch": 2.911341853035144, + "grad_norm": 0.047877270728349686, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3645 + }, + { + "epoch": 2.912140575079872, + "grad_norm": 0.11930714547634125, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3646 + }, + { + "epoch": 2.9129392971246006, + "grad_norm": 0.1873956024646759, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3647 + }, + { + "epoch": 2.913738019169329, + "grad_norm": 0.22310249507427216, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3648 + }, + { + "epoch": 2.9145367412140573, + "grad_norm": 0.21259911358356476, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3649 + }, + { + "epoch": 2.915335463258786, + "grad_norm": 0.11584217846393585, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3650 + }, + { + "epoch": 2.9161341853035143, + "grad_norm": 0.04092720150947571, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3651 + }, + { + "epoch": 2.916932907348243, + "grad_norm": 0.14542047679424286, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3652 + }, + { + "epoch": 2.9177316293929714, + "grad_norm": 0.16328515112400055, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3653 + }, + { + "epoch": 2.9185303514377, + "grad_norm": 0.11284583806991577, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3654 + }, + { + "epoch": 2.919329073482428, + "grad_norm": 0.03723357245326042, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3655 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.1347448229789734, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 3656 + }, + { + "epoch": 2.920926517571885, + "grad_norm": 0.1697797328233719, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3657 + }, + { + "epoch": 2.9217252396166136, + "grad_norm": 0.12122484296560287, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3658 + }, + { + "epoch": 2.9225239616613417, + "grad_norm": 0.043503791093826294, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3659 + }, + { + "epoch": 2.92332268370607, + "grad_norm": 0.1600242555141449, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3660 + }, + { + "epoch": 2.9241214057507987, + "grad_norm": 0.21065576374530792, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3661 + }, + { + "epoch": 2.9249201277955272, + "grad_norm": 0.16726253926753998, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3662 + }, + { + "epoch": 2.9257188498402558, + "grad_norm": 0.09178615361452103, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3663 + }, + { + "epoch": 2.9265175718849843, + "grad_norm": 0.0447201170027256, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3664 + }, + { + "epoch": 2.9273162939297124, + "grad_norm": 0.10462333261966705, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3665 + }, + { + "epoch": 2.928115015974441, + "grad_norm": 0.08236772567033768, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3666 + }, + { + "epoch": 2.9289137380191694, + "grad_norm": 0.06551375985145569, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3667 + }, + { + "epoch": 2.9297124600638975, + "grad_norm": 0.1531982123851776, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 3668 + }, + { + "epoch": 2.930511182108626, + "grad_norm": 0.19483166933059692, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 3669 + }, + { + "epoch": 2.9313099041533546, + "grad_norm": 0.12347809225320816, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 3670 + }, + { + "epoch": 2.932108626198083, + "grad_norm": 0.05494467169046402, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3671 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 0.2280847579240799, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3672 + }, + { + "epoch": 2.93370607028754, + "grad_norm": 0.30344241857528687, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3673 + }, + { + "epoch": 2.9345047923322682, + "grad_norm": 0.243449404835701, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 3674 + }, + { + "epoch": 2.9353035143769968, + "grad_norm": 0.11542543768882751, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3675 + }, + { + "epoch": 2.9361022364217253, + "grad_norm": 0.09501481056213379, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 3676 + }, + { + "epoch": 2.936900958466454, + "grad_norm": 0.2299363762140274, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3677 + }, + { + "epoch": 2.937699680511182, + "grad_norm": 0.15020152926445007, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3678 + }, + { + "epoch": 2.9384984025559104, + "grad_norm": 0.0655093789100647, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3679 + }, + { + "epoch": 2.939297124600639, + "grad_norm": 0.15242713689804077, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3680 + }, + { + "epoch": 2.9400958466453675, + "grad_norm": 0.13315139710903168, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3681 + }, + { + "epoch": 2.940894568690096, + "grad_norm": 0.05966462939977646, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 3682 + }, + { + "epoch": 2.9416932907348246, + "grad_norm": 0.08146806806325912, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3683 + }, + { + "epoch": 2.9424920127795526, + "grad_norm": 0.13615436851978302, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3684 + }, + { + "epoch": 2.943290734824281, + "grad_norm": 0.10889092832803726, + "learning_rate": 0.0005, + "loss": 1.043, + "step": 3685 + }, + { + "epoch": 2.9440894568690097, + "grad_norm": 0.03455124795436859, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3686 + }, + { + "epoch": 2.9448881789137378, + "grad_norm": 0.07490532845258713, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3687 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.08072194457054138, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3688 + }, + { + "epoch": 2.946485623003195, + "grad_norm": 0.03630111739039421, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3689 + }, + { + "epoch": 2.9472843450479234, + "grad_norm": 0.09075939655303955, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3690 + }, + { + "epoch": 2.948083067092652, + "grad_norm": 0.1618475615978241, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3691 + }, + { + "epoch": 2.9488817891373804, + "grad_norm": 0.18354517221450806, + "learning_rate": 0.0005, + "loss": 1.0398, + "step": 3692 + }, + { + "epoch": 2.9496805111821085, + "grad_norm": 0.170358344912529, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3693 + }, + { + "epoch": 2.950479233226837, + "grad_norm": 0.10800250619649887, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3694 + }, + { + "epoch": 2.9512779552715656, + "grad_norm": 0.03771398589015007, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3695 + }, + { + "epoch": 2.952076677316294, + "grad_norm": 0.07931157946586609, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 3696 + }, + { + "epoch": 2.952875399361022, + "grad_norm": 0.08149557560682297, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3697 + }, + { + "epoch": 2.9536741214057507, + "grad_norm": 0.05122899264097214, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3698 + }, + { + "epoch": 2.9544728434504792, + "grad_norm": 0.040845707058906555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3699 + }, + { + "epoch": 2.9552715654952078, + "grad_norm": 0.11444225907325745, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3700 + }, + { + "epoch": 2.9560702875399363, + "grad_norm": 0.20140959322452545, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3701 + }, + { + "epoch": 2.956869009584665, + "grad_norm": 0.24982111155986786, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3702 + }, + { + "epoch": 2.957667731629393, + "grad_norm": 0.21290510892868042, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3703 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.11526014655828476, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3704 + }, + { + "epoch": 2.95926517571885, + "grad_norm": 0.03769242390990257, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3705 + }, + { + "epoch": 2.960063897763578, + "grad_norm": 0.091837577521801, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3706 + }, + { + "epoch": 2.9608626198083066, + "grad_norm": 0.0956759825348854, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 3707 + }, + { + "epoch": 2.961661341853035, + "grad_norm": 0.06945781409740448, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3708 + }, + { + "epoch": 2.9624600638977636, + "grad_norm": 0.03904029354453087, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3709 + }, + { + "epoch": 2.963258785942492, + "grad_norm": 0.1264238953590393, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 3710 + }, + { + "epoch": 2.9640575079872207, + "grad_norm": 0.1689605861902237, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3711 + }, + { + "epoch": 2.9648562300319488, + "grad_norm": 0.15059368312358856, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3712 + }, + { + "epoch": 2.9656549520766773, + "grad_norm": 0.12976346909999847, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3713 + }, + { + "epoch": 2.966453674121406, + "grad_norm": 0.08460741490125656, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3714 + }, + { + "epoch": 2.9672523961661343, + "grad_norm": 0.04914790764451027, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3715 + }, + { + "epoch": 2.9680511182108624, + "grad_norm": 0.09629235416650772, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3716 + }, + { + "epoch": 2.968849840255591, + "grad_norm": 0.0895731970667839, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 3717 + }, + { + "epoch": 2.9696485623003195, + "grad_norm": 0.039528124034404755, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3718 + }, + { + "epoch": 2.970447284345048, + "grad_norm": 0.12843455374240875, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3719 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.1754530519247055, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3720 + }, + { + "epoch": 2.972044728434505, + "grad_norm": 0.14169782400131226, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3721 + }, + { + "epoch": 2.972843450479233, + "grad_norm": 0.04416975378990173, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3722 + }, + { + "epoch": 2.9736421725239617, + "grad_norm": 0.1259031444787979, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3723 + }, + { + "epoch": 2.97444089456869, + "grad_norm": 0.17667949199676514, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3724 + }, + { + "epoch": 2.9752396166134183, + "grad_norm": 0.1213974729180336, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3725 + }, + { + "epoch": 2.976038338658147, + "grad_norm": 0.052554335445165634, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3726 + }, + { + "epoch": 2.9768370607028753, + "grad_norm": 0.13257208466529846, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3727 + }, + { + "epoch": 2.977635782747604, + "grad_norm": 0.1463504135608673, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 3728 + }, + { + "epoch": 2.9784345047923324, + "grad_norm": 0.08546306937932968, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3729 + }, + { + "epoch": 2.979233226837061, + "grad_norm": 0.04226094111800194, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 3730 + }, + { + "epoch": 2.980031948881789, + "grad_norm": 0.0924859419465065, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3731 + }, + { + "epoch": 2.9808306709265175, + "grad_norm": 0.1094423234462738, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3732 + }, + { + "epoch": 2.981629392971246, + "grad_norm": 0.11132006347179413, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3733 + }, + { + "epoch": 2.9824281150159746, + "grad_norm": 0.11010250449180603, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3734 + }, + { + "epoch": 2.9832268370607027, + "grad_norm": 0.10370460152626038, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3735 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 0.08460240811109543, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3736 + }, + { + "epoch": 2.9848242811501597, + "grad_norm": 0.06218400225043297, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3737 + }, + { + "epoch": 2.9856230031948883, + "grad_norm": 0.07446395605802536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3738 + }, + { + "epoch": 2.986421725239617, + "grad_norm": 0.06072726845741272, + "learning_rate": 0.0005, + "loss": 1.0376, + "step": 3739 + }, + { + "epoch": 2.987220447284345, + "grad_norm": 0.07607559114694595, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3740 + }, + { + "epoch": 2.9880191693290734, + "grad_norm": 0.151380717754364, + "learning_rate": 0.0005, + "loss": 1.0383, + "step": 3741 + }, + { + "epoch": 2.988817891373802, + "grad_norm": 0.24132277071475983, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3742 + }, + { + "epoch": 2.9896166134185305, + "grad_norm": 0.2346547245979309, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3743 + }, + { + "epoch": 2.9904153354632586, + "grad_norm": 0.090092234313488, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3744 + }, + { + "epoch": 2.991214057507987, + "grad_norm": 0.10230003297328949, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3745 + }, + { + "epoch": 2.9920127795527156, + "grad_norm": 0.17678654193878174, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 3746 + }, + { + "epoch": 2.992811501597444, + "grad_norm": 0.16382110118865967, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3747 + }, + { + "epoch": 2.9936102236421727, + "grad_norm": 0.06456442922353745, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3748 + }, + { + "epoch": 2.994408945686901, + "grad_norm": 0.1774967759847641, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 3749 + }, + { + "epoch": 2.9952076677316293, + "grad_norm": 0.19274447858333588, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3750 + }, + { + "epoch": 2.996006389776358, + "grad_norm": 0.10767998546361923, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3751 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.07864238321781158, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 3752 + }, + { + "epoch": 2.997603833865815, + "grad_norm": 0.21339190006256104, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 3753 + }, + { + "epoch": 2.998402555910543, + "grad_norm": 0.2560347616672516, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 3754 + }, + { + "epoch": 2.9992012779552715, + "grad_norm": 0.15730907022953033, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 3755 + }, + { + "epoch": 3.0, + "grad_norm": 0.09766457974910736, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3756 + }, + { + "epoch": 3.0007987220447285, + "grad_norm": 0.24393433332443237, + "learning_rate": 0.0005, + "loss": 1.0389, + "step": 3757 + }, + { + "epoch": 3.001597444089457, + "grad_norm": 0.17650263011455536, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 3758 + }, + { + "epoch": 3.002396166134185, + "grad_norm": 0.06490518152713776, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3759 + }, + { + "epoch": 3.0031948881789137, + "grad_norm": 0.10893388092517853, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3760 + }, + { + "epoch": 3.003993610223642, + "grad_norm": 0.13606922328472137, + "learning_rate": 0.0005, + "loss": 1.0408, + "step": 3761 + }, + { + "epoch": 3.0047923322683707, + "grad_norm": 0.07880546152591705, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3762 + }, + { + "epoch": 3.0055910543130993, + "grad_norm": 0.04203686863183975, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3763 + }, + { + "epoch": 3.0063897763578273, + "grad_norm": 0.07509997487068176, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 3764 + }, + { + "epoch": 3.007188498402556, + "grad_norm": 0.08529910445213318, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3765 + }, + { + "epoch": 3.0079872204472844, + "grad_norm": 0.05542825534939766, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3766 + }, + { + "epoch": 3.008785942492013, + "grad_norm": 0.08245155215263367, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3767 + }, + { + "epoch": 3.009584664536741, + "grad_norm": 0.09580255299806595, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3768 + }, + { + "epoch": 3.0103833865814695, + "grad_norm": 0.08233854174613953, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3769 + }, + { + "epoch": 3.011182108626198, + "grad_norm": 0.0589553639292717, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 3770 + }, + { + "epoch": 3.0119808306709266, + "grad_norm": 0.09862494468688965, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3771 + }, + { + "epoch": 3.012779552715655, + "grad_norm": 0.1471278816461563, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3772 + }, + { + "epoch": 3.013578274760383, + "grad_norm": 0.1422986537218094, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3773 + }, + { + "epoch": 3.0143769968051117, + "grad_norm": 0.06627846509218216, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3774 + }, + { + "epoch": 3.0151757188498403, + "grad_norm": 0.04936077445745468, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3775 + }, + { + "epoch": 3.015974440894569, + "grad_norm": 0.0745953619480133, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3776 + }, + { + "epoch": 3.0167731629392973, + "grad_norm": 0.0725102499127388, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3777 + }, + { + "epoch": 3.0175718849840254, + "grad_norm": 0.04181717708706856, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3778 + }, + { + "epoch": 3.018370607028754, + "grad_norm": 0.09955357760190964, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3779 + }, + { + "epoch": 3.0191693290734825, + "grad_norm": 0.21014735102653503, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3780 + }, + { + "epoch": 3.019968051118211, + "grad_norm": 0.30597689747810364, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3781 + }, + { + "epoch": 3.0207667731629395, + "grad_norm": 0.2930602431297302, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3782 + }, + { + "epoch": 3.0215654952076676, + "grad_norm": 0.1190100908279419, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3783 + }, + { + "epoch": 3.022364217252396, + "grad_norm": 0.0655524879693985, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3784 + }, + { + "epoch": 3.0231629392971247, + "grad_norm": 0.12062554061412811, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3785 + }, + { + "epoch": 3.023961661341853, + "grad_norm": 0.09680327773094177, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3786 + }, + { + "epoch": 3.0247603833865813, + "grad_norm": 0.0555860660970211, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3787 + }, + { + "epoch": 3.02555910543131, + "grad_norm": 0.1271962672472, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3788 + }, + { + "epoch": 3.0263578274760383, + "grad_norm": 0.12178758531808853, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3789 + }, + { + "epoch": 3.027156549520767, + "grad_norm": 0.09623143821954727, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3790 + }, + { + "epoch": 3.0279552715654954, + "grad_norm": 0.04004101827740669, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3791 + }, + { + "epoch": 3.0287539936102235, + "grad_norm": 0.14001014828681946, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3792 + }, + { + "epoch": 3.029552715654952, + "grad_norm": 0.24241770803928375, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3793 + }, + { + "epoch": 3.0303514376996805, + "grad_norm": 0.29141902923583984, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3794 + }, + { + "epoch": 3.031150159744409, + "grad_norm": 0.22814971208572388, + "learning_rate": 0.0005, + "loss": 1.0397, + "step": 3795 + }, + { + "epoch": 3.0319488817891376, + "grad_norm": 0.08114828914403915, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 3796 + }, + { + "epoch": 3.0327476038338657, + "grad_norm": 0.08104736357927322, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3797 + }, + { + "epoch": 3.033546325878594, + "grad_norm": 0.12007702887058258, + "learning_rate": 0.0005, + "loss": 1.0374, + "step": 3798 + }, + { + "epoch": 3.0343450479233227, + "grad_norm": 0.06497872620820999, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 3799 + }, + { + "epoch": 3.0351437699680512, + "grad_norm": 0.07407233864068985, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3800 + }, + { + "epoch": 3.0359424920127798, + "grad_norm": 0.16386932134628296, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 3801 + }, + { + "epoch": 3.036741214057508, + "grad_norm": 0.21633599698543549, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3802 + }, + { + "epoch": 3.0375399361022364, + "grad_norm": 0.19224147498607635, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3803 + }, + { + "epoch": 3.038338658146965, + "grad_norm": 0.04962728172540665, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 3804 + }, + { + "epoch": 3.0391373801916934, + "grad_norm": 0.17984353005886078, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 3805 + }, + { + "epoch": 3.0399361022364215, + "grad_norm": 0.31483346223831177, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3806 + }, + { + "epoch": 3.04073482428115, + "grad_norm": 0.27175095677375793, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3807 + }, + { + "epoch": 3.0415335463258786, + "grad_norm": 0.06302175670862198, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3808 + }, + { + "epoch": 3.042332268370607, + "grad_norm": 0.18620255589485168, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3809 + }, + { + "epoch": 3.0431309904153356, + "grad_norm": 0.23254868388175964, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3810 + }, + { + "epoch": 3.0439297124600637, + "grad_norm": 0.08763844519853592, + "learning_rate": 0.0005, + "loss": 1.0399, + "step": 3811 + }, + { + "epoch": 3.0447284345047922, + "grad_norm": 0.13173392415046692, + "learning_rate": 0.0005, + "loss": 1.0401, + "step": 3812 + }, + { + "epoch": 3.0455271565495208, + "grad_norm": 0.24171577394008636, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3813 + }, + { + "epoch": 3.0463258785942493, + "grad_norm": 0.17649634182453156, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 3814 + }, + { + "epoch": 3.047124600638978, + "grad_norm": 0.03800780326128006, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3815 + }, + { + "epoch": 3.047923322683706, + "grad_norm": 0.20039476454257965, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3816 + }, + { + "epoch": 3.0487220447284344, + "grad_norm": 0.26794761419296265, + "learning_rate": 0.0005, + "loss": 1.0394, + "step": 3817 + }, + { + "epoch": 3.049520766773163, + "grad_norm": 0.18026290833950043, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3818 + }, + { + "epoch": 3.0503194888178915, + "grad_norm": 0.07298897206783295, + "learning_rate": 0.0005, + "loss": 1.0332, + "step": 3819 + }, + { + "epoch": 3.0511182108626196, + "grad_norm": 0.11078597605228424, + "learning_rate": 0.0005, + "loss": 1.0347, + "step": 3820 + }, + { + "epoch": 3.051916932907348, + "grad_norm": 0.13672129809856415, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3821 + }, + { + "epoch": 3.0527156549520766, + "grad_norm": 0.11172370612621307, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3822 + }, + { + "epoch": 3.053514376996805, + "grad_norm": 0.09000302106142044, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3823 + }, + { + "epoch": 3.0543130990415337, + "grad_norm": 0.055291030555963516, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3824 + }, + { + "epoch": 3.055111821086262, + "grad_norm": 0.05691349133849144, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3825 + }, + { + "epoch": 3.0559105431309903, + "grad_norm": 0.0744122862815857, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3826 + }, + { + "epoch": 3.056709265175719, + "grad_norm": 0.06438847631216049, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 3827 + }, + { + "epoch": 3.0575079872204474, + "grad_norm": 0.0926717221736908, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 3828 + }, + { + "epoch": 3.058306709265176, + "grad_norm": 0.15286727249622345, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3829 + }, + { + "epoch": 3.059105431309904, + "grad_norm": 0.2049989253282547, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3830 + }, + { + "epoch": 3.0599041533546325, + "grad_norm": 0.1832154393196106, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3831 + }, + { + "epoch": 3.060702875399361, + "grad_norm": 0.0953374058008194, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 3832 + }, + { + "epoch": 3.0615015974440896, + "grad_norm": 0.063878633081913, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3833 + }, + { + "epoch": 3.062300319488818, + "grad_norm": 0.17062409222126007, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3834 + }, + { + "epoch": 3.063099041533546, + "grad_norm": 0.23467828333377838, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 3835 + }, + { + "epoch": 3.0638977635782747, + "grad_norm": 0.19458062946796417, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3836 + }, + { + "epoch": 3.0646964856230032, + "grad_norm": 0.06614453345537186, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 3837 + }, + { + "epoch": 3.0654952076677318, + "grad_norm": 0.1250256896018982, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3838 + }, + { + "epoch": 3.06629392971246, + "grad_norm": 0.2399163395166397, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3839 + }, + { + "epoch": 3.0670926517571884, + "grad_norm": 0.22544947266578674, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3840 + }, + { + "epoch": 3.067891373801917, + "grad_norm": 0.0710826963186264, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 3841 + }, + { + "epoch": 3.0686900958466454, + "grad_norm": 0.12259501218795776, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 3842 + }, + { + "epoch": 3.069488817891374, + "grad_norm": 0.1313357651233673, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3843 + }, + { + "epoch": 3.070287539936102, + "grad_norm": 0.05492740869522095, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3844 + }, + { + "epoch": 3.0710862619808306, + "grad_norm": 0.08860959857702255, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3845 + }, + { + "epoch": 3.071884984025559, + "grad_norm": 0.12556305527687073, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3846 + }, + { + "epoch": 3.0726837060702876, + "grad_norm": 0.10780923813581467, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3847 + }, + { + "epoch": 3.073482428115016, + "grad_norm": 0.0587402880191803, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3848 + }, + { + "epoch": 3.0742811501597442, + "grad_norm": 0.06155085563659668, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3849 + }, + { + "epoch": 3.0750798722044728, + "grad_norm": 0.07258733361959457, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3850 + }, + { + "epoch": 3.0758785942492013, + "grad_norm": 0.060939520597457886, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3851 + }, + { + "epoch": 3.07667731629393, + "grad_norm": 0.07125407457351685, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3852 + }, + { + "epoch": 3.0774760383386583, + "grad_norm": 0.15338753163814545, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3853 + }, + { + "epoch": 3.0782747603833864, + "grad_norm": 0.18328991532325745, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 3854 + }, + { + "epoch": 3.079073482428115, + "grad_norm": 0.1338629275560379, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3855 + }, + { + "epoch": 3.0798722044728435, + "grad_norm": 0.042017024010419846, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3856 + }, + { + "epoch": 3.080670926517572, + "grad_norm": 0.13696196675300598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 3857 + }, + { + "epoch": 3.0814696485623, + "grad_norm": 0.17552919685840607, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3858 + }, + { + "epoch": 3.0822683706070286, + "grad_norm": 0.09906235337257385, + "learning_rate": 0.0005, + "loss": 1.0439, + "step": 3859 + }, + { + "epoch": 3.083067092651757, + "grad_norm": 0.057398926466703415, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3860 + }, + { + "epoch": 3.0838658146964857, + "grad_norm": 0.12260781973600388, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3861 + }, + { + "epoch": 3.084664536741214, + "grad_norm": 0.12672549486160278, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 3862 + }, + { + "epoch": 3.0854632587859423, + "grad_norm": 0.07239031046628952, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 3863 + }, + { + "epoch": 3.086261980830671, + "grad_norm": 0.0928259864449501, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 3864 + }, + { + "epoch": 3.0870607028753994, + "grad_norm": 0.2161056250333786, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3865 + }, + { + "epoch": 3.087859424920128, + "grad_norm": 0.21302388608455658, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 3866 + }, + { + "epoch": 3.0886581469648564, + "grad_norm": 0.10730110853910446, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 3867 + }, + { + "epoch": 3.0894568690095845, + "grad_norm": 0.06801975518465042, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3868 + }, + { + "epoch": 3.090255591054313, + "grad_norm": 0.09036632627248764, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3869 + }, + { + "epoch": 3.0910543130990416, + "grad_norm": 0.1344052255153656, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3870 + }, + { + "epoch": 3.09185303514377, + "grad_norm": 0.10774482041597366, + "learning_rate": 0.0005, + "loss": 1.0373, + "step": 3871 + }, + { + "epoch": 3.0926517571884986, + "grad_norm": 0.06824023276567459, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 3872 + }, + { + "epoch": 3.0934504792332267, + "grad_norm": 0.11959507316350937, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 3873 + }, + { + "epoch": 3.094249201277955, + "grad_norm": 0.14943768084049225, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 3874 + }, + { + "epoch": 3.0950479233226837, + "grad_norm": 0.13593481481075287, + "learning_rate": 0.0005, + "loss": 1.0411, + "step": 3875 + }, + { + "epoch": 3.0958466453674123, + "grad_norm": 0.06872473657131195, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 3876 + }, + { + "epoch": 3.0966453674121404, + "grad_norm": 0.07243353873491287, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3877 + }, + { + "epoch": 3.097444089456869, + "grad_norm": 0.07884293049573898, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3878 + }, + { + "epoch": 3.0982428115015974, + "grad_norm": 0.09574474394321442, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3879 + }, + { + "epoch": 3.099041533546326, + "grad_norm": 0.09028270840644836, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3880 + }, + { + "epoch": 3.0998402555910545, + "grad_norm": 0.056680940091609955, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3881 + }, + { + "epoch": 3.1006389776357826, + "grad_norm": 0.13817615807056427, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 3882 + }, + { + "epoch": 3.101437699680511, + "grad_norm": 0.16102705895900726, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3883 + }, + { + "epoch": 3.1022364217252396, + "grad_norm": 0.08887791633605957, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 3884 + }, + { + "epoch": 3.103035143769968, + "grad_norm": 0.055100735276937485, + "learning_rate": 0.0005, + "loss": 1.04, + "step": 3885 + }, + { + "epoch": 3.1038338658146967, + "grad_norm": 0.10710839927196503, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 3886 + }, + { + "epoch": 3.1046325878594248, + "grad_norm": 0.09228713810443878, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3887 + }, + { + "epoch": 3.1054313099041533, + "grad_norm": 0.04602783918380737, + "learning_rate": 0.0005, + "loss": 1.0403, + "step": 3888 + }, + { + "epoch": 3.106230031948882, + "grad_norm": 0.03584764152765274, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 3889 + }, + { + "epoch": 3.1070287539936103, + "grad_norm": 0.04486532881855965, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 3890 + }, + { + "epoch": 3.107827476038339, + "grad_norm": 0.036488354206085205, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 3891 + }, + { + "epoch": 3.108626198083067, + "grad_norm": 0.04213477671146393, + "learning_rate": 0.0005, + "loss": 1.0392, + "step": 3892 + }, + { + "epoch": 3.1094249201277955, + "grad_norm": 0.03840509057044983, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 3893 + }, + { + "epoch": 3.110223642172524, + "grad_norm": 0.04800419509410858, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3894 + }, + { + "epoch": 3.1110223642172525, + "grad_norm": 0.06467507034540176, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3895 + }, + { + "epoch": 3.1118210862619806, + "grad_norm": 0.05736416578292847, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 3896 + }, + { + "epoch": 3.112619808306709, + "grad_norm": 0.03337489813566208, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3897 + }, + { + "epoch": 3.1134185303514377, + "grad_norm": 0.088229238986969, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 3898 + }, + { + "epoch": 3.114217252396166, + "grad_norm": 0.1492392122745514, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3899 + }, + { + "epoch": 3.1150159744408947, + "grad_norm": 0.1699269413948059, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 3900 + }, + { + "epoch": 3.115814696485623, + "grad_norm": 0.11532948911190033, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 3901 + }, + { + "epoch": 3.1166134185303513, + "grad_norm": 0.030054764822125435, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 3902 + }, + { + "epoch": 3.11741214057508, + "grad_norm": 0.11079075932502747, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 3903 + }, + { + "epoch": 3.1182108626198084, + "grad_norm": 0.15733082592487335, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 3904 + }, + { + "epoch": 3.119009584664537, + "grad_norm": 0.12520034611225128, + "learning_rate": 0.0005, + "loss": 1.0358, + "step": 3905 + }, + { + "epoch": 3.119808306709265, + "grad_norm": 0.03382280096411705, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 3906 + }, + { + "epoch": 3.1206070287539935, + "grad_norm": 0.11951576173305511, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 3907 + }, + { + "epoch": 3.121405750798722, + "grad_norm": 0.2123839259147644, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 3908 + }, + { + "epoch": 3.1222044728434506, + "grad_norm": 0.15437674522399902, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3909 + }, + { + "epoch": 3.123003194888179, + "grad_norm": 0.06463608890771866, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3910 + }, + { + "epoch": 3.123801916932907, + "grad_norm": 0.10830746591091156, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3911 + }, + { + "epoch": 3.1246006389776357, + "grad_norm": 0.17621003091335297, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 3912 + }, + { + "epoch": 3.1253993610223643, + "grad_norm": 0.12417379021644592, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 3913 + }, + { + "epoch": 3.126198083067093, + "grad_norm": 0.05364898219704628, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 3914 + }, + { + "epoch": 3.126996805111821, + "grad_norm": 0.17589502036571503, + "learning_rate": 0.0005, + "loss": 1.0396, + "step": 3915 + }, + { + "epoch": 3.1277955271565494, + "grad_norm": 0.249656081199646, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3916 + }, + { + "epoch": 3.128594249201278, + "grad_norm": 0.1800973266363144, + "learning_rate": 0.0005, + "loss": 1.0393, + "step": 3917 + }, + { + "epoch": 3.1293929712460065, + "grad_norm": 0.09763745218515396, + "learning_rate": 0.0005, + "loss": 1.0378, + "step": 3918 + }, + { + "epoch": 3.130191693290735, + "grad_norm": 0.10953835397958755, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 3919 + }, + { + "epoch": 3.130990415335463, + "grad_norm": 0.17490456998348236, + "learning_rate": 0.0005, + "loss": 1.0427, + "step": 3920 + }, + { + "epoch": 3.1317891373801916, + "grad_norm": 0.11533153057098389, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 3921 + }, + { + "epoch": 3.13258785942492, + "grad_norm": 0.07494231313467026, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3922 + }, + { + "epoch": 3.1333865814696487, + "grad_norm": 0.14954763650894165, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 3923 + }, + { + "epoch": 3.134185303514377, + "grad_norm": 0.18061646819114685, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 3924 + }, + { + "epoch": 3.1349840255591053, + "grad_norm": 0.10419650375843048, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3925 + }, + { + "epoch": 3.135782747603834, + "grad_norm": 0.04677566513419151, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 3926 + }, + { + "epoch": 3.1365814696485623, + "grad_norm": 0.12846903502941132, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3927 + }, + { + "epoch": 3.137380191693291, + "grad_norm": 0.11824795603752136, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 3928 + }, + { + "epoch": 3.1381789137380194, + "grad_norm": 0.04194530099630356, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3929 + }, + { + "epoch": 3.1389776357827475, + "grad_norm": 0.15154412388801575, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 3930 + }, + { + "epoch": 3.139776357827476, + "grad_norm": 0.19073615968227386, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 3931 + }, + { + "epoch": 3.1405750798722045, + "grad_norm": 0.12614648044109344, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3932 + }, + { + "epoch": 3.141373801916933, + "grad_norm": 0.03434520214796066, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 3933 + }, + { + "epoch": 3.142172523961661, + "grad_norm": 0.11913489550352097, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 3934 + }, + { + "epoch": 3.1429712460063897, + "grad_norm": 0.16297172009944916, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 3935 + }, + { + "epoch": 3.143769968051118, + "grad_norm": 0.15605789422988892, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 3936 + }, + { + "epoch": 3.1445686900958467, + "grad_norm": 0.10524406284093857, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3937 + }, + { + "epoch": 3.1453674121405752, + "grad_norm": 0.03763152286410332, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3938 + }, + { + "epoch": 3.1461661341853033, + "grad_norm": 0.07586465775966644, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 3939 + }, + { + "epoch": 3.146964856230032, + "grad_norm": 0.14553581178188324, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 3940 + }, + { + "epoch": 3.1477635782747604, + "grad_norm": 0.1883595883846283, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 3941 + }, + { + "epoch": 3.148562300319489, + "grad_norm": 0.13018599152565002, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 3942 + }, + { + "epoch": 3.1493610223642174, + "grad_norm": 0.05356704071164131, + "learning_rate": 0.0005, + "loss": 1.0413, + "step": 3943 + }, + { + "epoch": 3.1501597444089455, + "grad_norm": 0.2083088606595993, + "learning_rate": 0.0005, + "loss": 1.0405, + "step": 3944 + }, + { + "epoch": 3.150958466453674, + "grad_norm": 0.2586681544780731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3945 + }, + { + "epoch": 3.1517571884984026, + "grad_norm": 0.18733063340187073, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 3946 + }, + { + "epoch": 3.152555910543131, + "grad_norm": 0.03741752356290817, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 3947 + }, + { + "epoch": 3.1533546325878596, + "grad_norm": 0.11660216003656387, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 3948 + }, + { + "epoch": 3.1541533546325877, + "grad_norm": 0.12698383629322052, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 3949 + }, + { + "epoch": 3.1549520766773163, + "grad_norm": 0.10244922339916229, + "learning_rate": 0.0005, + "loss": 1.037, + "step": 3950 + }, + { + "epoch": 3.155750798722045, + "grad_norm": 0.03815237060189247, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3951 + }, + { + "epoch": 3.1565495207667733, + "grad_norm": 0.04394761845469475, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 3952 + }, + { + "epoch": 3.1573482428115014, + "grad_norm": 0.1344541311264038, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 3953 + }, + { + "epoch": 3.15814696485623, + "grad_norm": 0.23006947338581085, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 3954 + }, + { + "epoch": 3.1589456869009584, + "grad_norm": 0.2667021155357361, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3955 + }, + { + "epoch": 3.159744408945687, + "grad_norm": 0.2410362809896469, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3956 + }, + { + "epoch": 3.1605431309904155, + "grad_norm": 0.1421661078929901, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3957 + }, + { + "epoch": 3.1613418530351436, + "grad_norm": 0.04178561642765999, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 3958 + }, + { + "epoch": 3.162140575079872, + "grad_norm": 0.15327088534832, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3959 + }, + { + "epoch": 3.1629392971246006, + "grad_norm": 0.1372532993555069, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3960 + }, + { + "epoch": 3.163738019169329, + "grad_norm": 0.03763817250728607, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3961 + }, + { + "epoch": 3.1645367412140577, + "grad_norm": 0.13227587938308716, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 3962 + }, + { + "epoch": 3.165335463258786, + "grad_norm": 0.1952073723077774, + "learning_rate": 0.0005, + "loss": 1.0395, + "step": 3963 + }, + { + "epoch": 3.1661341853035143, + "grad_norm": 0.1672048568725586, + "learning_rate": 0.0005, + "loss": 1.0412, + "step": 3964 + }, + { + "epoch": 3.166932907348243, + "grad_norm": 0.09593698382377625, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 3965 + }, + { + "epoch": 3.1677316293929714, + "grad_norm": 0.03619454428553581, + "learning_rate": 0.0005, + "loss": 1.0385, + "step": 3966 + }, + { + "epoch": 3.1685303514377, + "grad_norm": 0.05974683538079262, + "learning_rate": 0.0005, + "loss": 1.0417, + "step": 3967 + }, + { + "epoch": 3.169329073482428, + "grad_norm": 0.09733424335718155, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3968 + }, + { + "epoch": 3.1701277955271565, + "grad_norm": 0.07536087185144424, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 3969 + }, + { + "epoch": 3.170926517571885, + "grad_norm": 0.04263869300484657, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 3970 + }, + { + "epoch": 3.1717252396166136, + "grad_norm": 0.040521468967199326, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 3971 + }, + { + "epoch": 3.1725239616613417, + "grad_norm": 0.05615096539258957, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 3972 + }, + { + "epoch": 3.17332268370607, + "grad_norm": 0.06655194610357285, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 3973 + }, + { + "epoch": 3.1741214057507987, + "grad_norm": 0.07300302386283875, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 3974 + }, + { + "epoch": 3.1749201277955272, + "grad_norm": 0.04789174720644951, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 3975 + }, + { + "epoch": 3.1757188498402558, + "grad_norm": 0.03460157290101051, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 3976 + }, + { + "epoch": 3.176517571884984, + "grad_norm": 0.0393557995557785, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 3977 + }, + { + "epoch": 3.1773162939297124, + "grad_norm": 0.062453389167785645, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 3978 + }, + { + "epoch": 3.178115015974441, + "grad_norm": 0.08542043715715408, + "learning_rate": 0.0005, + "loss": 1.0388, + "step": 3979 + }, + { + "epoch": 3.1789137380191694, + "grad_norm": 0.08002828061580658, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 3980 + }, + { + "epoch": 3.179712460063898, + "grad_norm": 0.04635196551680565, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3981 + }, + { + "epoch": 3.180511182108626, + "grad_norm": 0.09583642333745956, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 3982 + }, + { + "epoch": 3.1813099041533546, + "grad_norm": 0.12418454885482788, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 3983 + }, + { + "epoch": 3.182108626198083, + "grad_norm": 0.10457618534564972, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 3984 + }, + { + "epoch": 3.1829073482428116, + "grad_norm": 0.07183804363012314, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 3985 + }, + { + "epoch": 3.18370607028754, + "grad_norm": 0.039956409484148026, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 3986 + }, + { + "epoch": 3.1845047923322682, + "grad_norm": 0.0884016826748848, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 3987 + }, + { + "epoch": 3.1853035143769968, + "grad_norm": 0.112494558095932, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 3988 + }, + { + "epoch": 3.1861022364217253, + "grad_norm": 0.07582054287195206, + "learning_rate": 0.0005, + "loss": 1.0404, + "step": 3989 + }, + { + "epoch": 3.186900958466454, + "grad_norm": 0.060303278267383575, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 3990 + }, + { + "epoch": 3.187699680511182, + "grad_norm": 0.048326775431632996, + "learning_rate": 0.0005, + "loss": 1.0425, + "step": 3991 + }, + { + "epoch": 3.1884984025559104, + "grad_norm": 0.32322436571121216, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 3992 + }, + { + "epoch": 3.189297124600639, + "grad_norm": 0.5569815039634705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 3993 + }, + { + "epoch": 3.1900958466453675, + "grad_norm": 0.7590563893318176, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 3994 + }, + { + "epoch": 3.190894568690096, + "grad_norm": 0.6537879705429077, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 3995 + }, + { + "epoch": 3.191693290734824, + "grad_norm": 0.16556645929813385, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 3996 + }, + { + "epoch": 3.1924920127795526, + "grad_norm": 0.3745940625667572, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 3997 + }, + { + "epoch": 3.193290734824281, + "grad_norm": 0.5159009695053101, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 3998 + }, + { + "epoch": 3.1940894568690097, + "grad_norm": 0.1302756816148758, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 3999 + }, + { + "epoch": 3.194888178913738, + "grad_norm": 0.3484213054180145, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4000 + }, + { + "epoch": 3.1956869009584663, + "grad_norm": 0.23763029277324677, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4001 + }, + { + "epoch": 3.196485623003195, + "grad_norm": 0.20648746192455292, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4002 + }, + { + "epoch": 3.1972843450479234, + "grad_norm": 0.31230399012565613, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4003 + }, + { + "epoch": 3.198083067092652, + "grad_norm": 0.15389247238636017, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4004 + }, + { + "epoch": 3.1988817891373804, + "grad_norm": 0.6544334292411804, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4005 + }, + { + "epoch": 3.1996805111821085, + "grad_norm": 0.5409669280052185, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4006 + }, + { + "epoch": 3.200479233226837, + "grad_norm": 0.11126074194908142, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4007 + }, + { + "epoch": 3.2012779552715656, + "grad_norm": 0.3257724642753601, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4008 + }, + { + "epoch": 3.202076677316294, + "grad_norm": 0.4188903272151947, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4009 + }, + { + "epoch": 3.202875399361022, + "grad_norm": 0.1012830138206482, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4010 + }, + { + "epoch": 3.2036741214057507, + "grad_norm": 0.2771216034889221, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4011 + }, + { + "epoch": 3.2044728434504792, + "grad_norm": 0.2873278260231018, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4012 + }, + { + "epoch": 3.2052715654952078, + "grad_norm": 0.09620041400194168, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4013 + }, + { + "epoch": 3.2060702875399363, + "grad_norm": 0.10561787337064743, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4014 + }, + { + "epoch": 3.2068690095846644, + "grad_norm": 0.12499046325683594, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4015 + }, + { + "epoch": 3.207667731629393, + "grad_norm": 0.4055064916610718, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4016 + }, + { + "epoch": 3.2084664536741214, + "grad_norm": 0.9722099900245667, + "learning_rate": 0.0005, + "loss": 1.0815, + "step": 4017 + }, + { + "epoch": 3.20926517571885, + "grad_norm": 0.7367122173309326, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 4018 + }, + { + "epoch": 3.2100638977635785, + "grad_norm": 0.4455755650997162, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4019 + }, + { + "epoch": 3.2108626198083066, + "grad_norm": 0.10350961983203888, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4020 + }, + { + "epoch": 3.211661341853035, + "grad_norm": 0.41901662945747375, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 4021 + }, + { + "epoch": 3.2124600638977636, + "grad_norm": 0.5987749695777893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4022 + }, + { + "epoch": 3.213258785942492, + "grad_norm": 1.5967272520065308, + "learning_rate": 0.0005, + "loss": 1.1938, + "step": 4023 + }, + { + "epoch": 3.2140575079872207, + "grad_norm": 3.289113759994507, + "learning_rate": 0.0005, + "loss": 1.2474, + "step": 4024 + }, + { + "epoch": 3.2148562300319488, + "grad_norm": 0.40220701694488525, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 4025 + }, + { + "epoch": 3.2156549520766773, + "grad_norm": 0.15129008889198303, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4026 + }, + { + "epoch": 3.216453674121406, + "grad_norm": 19.060272216796875, + "learning_rate": 0.0005, + "loss": 1.4668, + "step": 4027 + }, + { + "epoch": 3.2172523961661343, + "grad_norm": 1.72987961769104, + "learning_rate": 0.0005, + "loss": 1.3675, + "step": 4028 + }, + { + "epoch": 3.2180511182108624, + "grad_norm": 2.1064836978912354, + "learning_rate": 0.0005, + "loss": 1.5454, + "step": 4029 + }, + { + "epoch": 3.218849840255591, + "grad_norm": 1.0206952095031738, + "learning_rate": 0.0005, + "loss": 1.2602, + "step": 4030 + }, + { + "epoch": 3.2196485623003195, + "grad_norm": 14.109564781188965, + "learning_rate": 0.0005, + "loss": 3.9831, + "step": 4031 + }, + { + "epoch": 3.220447284345048, + "grad_norm": 12.518637657165527, + "learning_rate": 0.0005, + "loss": 3.4388, + "step": 4032 + }, + { + "epoch": 3.2212460063897765, + "grad_norm": 4.156238079071045, + "learning_rate": 0.0005, + "loss": 2.1713, + "step": 4033 + }, + { + "epoch": 3.2220447284345046, + "grad_norm": 2.752128839492798, + "learning_rate": 0.0005, + "loss": 1.6581, + "step": 4034 + }, + { + "epoch": 3.222843450479233, + "grad_norm": 5.876696586608887, + "learning_rate": 0.0005, + "loss": 2.1698, + "step": 4035 + }, + { + "epoch": 3.2236421725239617, + "grad_norm": 7.60305118560791, + "learning_rate": 0.0005, + "loss": 3.0713, + "step": 4036 + }, + { + "epoch": 3.22444089456869, + "grad_norm": 2.581448554992676, + "learning_rate": 0.0005, + "loss": 1.7677, + "step": 4037 + }, + { + "epoch": 3.2252396166134187, + "grad_norm": 1.0544116497039795, + "learning_rate": 0.0005, + "loss": 1.4604, + "step": 4038 + }, + { + "epoch": 3.226038338658147, + "grad_norm": 10.742961883544922, + "learning_rate": 0.0005, + "loss": 3.8634, + "step": 4039 + }, + { + "epoch": 3.2268370607028753, + "grad_norm": 6.555435657501221, + "learning_rate": 0.0005, + "loss": 2.7229, + "step": 4040 + }, + { + "epoch": 3.227635782747604, + "grad_norm": 4.335379600524902, + "learning_rate": 0.0005, + "loss": 2.548, + "step": 4041 + }, + { + "epoch": 3.2284345047923324, + "grad_norm": 3.9863200187683105, + "learning_rate": 0.0005, + "loss": 2.5051, + "step": 4042 + }, + { + "epoch": 3.229233226837061, + "grad_norm": 3.4922895431518555, + "learning_rate": 0.0005, + "loss": 2.1996, + "step": 4043 + }, + { + "epoch": 3.230031948881789, + "grad_norm": 0.9404768347740173, + "learning_rate": 0.0005, + "loss": 1.7869, + "step": 4044 + }, + { + "epoch": 3.2308306709265175, + "grad_norm": 1.2953938245773315, + "learning_rate": 0.0005, + "loss": 1.7473, + "step": 4045 + }, + { + "epoch": 3.231629392971246, + "grad_norm": 2.0215165615081787, + "learning_rate": 0.0005, + "loss": 1.9429, + "step": 4046 + }, + { + "epoch": 3.2324281150159746, + "grad_norm": 1.2744032144546509, + "learning_rate": 0.0005, + "loss": 1.6993, + "step": 4047 + }, + { + "epoch": 3.2332268370607027, + "grad_norm": 2.042656660079956, + "learning_rate": 0.0005, + "loss": 1.6652, + "step": 4048 + }, + { + "epoch": 3.234025559105431, + "grad_norm": 6.607172012329102, + "learning_rate": 0.0005, + "loss": 2.8381, + "step": 4049 + }, + { + "epoch": 3.2348242811501597, + "grad_norm": 1.2499932050704956, + "learning_rate": 0.0005, + "loss": 1.6324, + "step": 4050 + }, + { + "epoch": 3.2356230031948883, + "grad_norm": 1.1896424293518066, + "learning_rate": 0.0005, + "loss": 1.7201, + "step": 4051 + }, + { + "epoch": 3.236421725239617, + "grad_norm": 1.9901418685913086, + "learning_rate": 0.0005, + "loss": 1.7335, + "step": 4052 + }, + { + "epoch": 3.237220447284345, + "grad_norm": 0.8886330127716064, + "learning_rate": 0.0005, + "loss": 1.5111, + "step": 4053 + }, + { + "epoch": 3.2380191693290734, + "grad_norm": 2.6570353507995605, + "learning_rate": 0.0005, + "loss": 1.8628, + "step": 4054 + }, + { + "epoch": 3.238817891373802, + "grad_norm": 2.212905168533325, + "learning_rate": 0.0005, + "loss": 1.5838, + "step": 4055 + }, + { + "epoch": 3.2396166134185305, + "grad_norm": 3.1234660148620605, + "learning_rate": 0.0005, + "loss": 1.7212, + "step": 4056 + }, + { + "epoch": 3.2404153354632586, + "grad_norm": 0.9168338775634766, + "learning_rate": 0.0005, + "loss": 1.5113, + "step": 4057 + }, + { + "epoch": 3.241214057507987, + "grad_norm": 0.8366042971611023, + "learning_rate": 0.0005, + "loss": 1.4997, + "step": 4058 + }, + { + "epoch": 3.2420127795527156, + "grad_norm": 0.5359059572219849, + "learning_rate": 0.0005, + "loss": 1.4185, + "step": 4059 + }, + { + "epoch": 3.242811501597444, + "grad_norm": 1.8511804342269897, + "learning_rate": 0.0005, + "loss": 1.4214, + "step": 4060 + }, + { + "epoch": 3.2436102236421727, + "grad_norm": 1.3229485750198364, + "learning_rate": 0.0005, + "loss": 1.4497, + "step": 4061 + }, + { + "epoch": 3.244408945686901, + "grad_norm": 0.8846393823623657, + "learning_rate": 0.0005, + "loss": 1.384, + "step": 4062 + }, + { + "epoch": 3.2452076677316293, + "grad_norm": 1.1345176696777344, + "learning_rate": 0.0005, + "loss": 1.3906, + "step": 4063 + }, + { + "epoch": 3.246006389776358, + "grad_norm": 0.998261034488678, + "learning_rate": 0.0005, + "loss": 1.3807, + "step": 4064 + }, + { + "epoch": 3.2468051118210863, + "grad_norm": 0.8998358249664307, + "learning_rate": 0.0005, + "loss": 1.3321, + "step": 4065 + }, + { + "epoch": 3.247603833865815, + "grad_norm": 0.6892838478088379, + "learning_rate": 0.0005, + "loss": 1.3718, + "step": 4066 + }, + { + "epoch": 3.248402555910543, + "grad_norm": 0.515389084815979, + "learning_rate": 0.0005, + "loss": 1.3296, + "step": 4067 + }, + { + "epoch": 3.2492012779552715, + "grad_norm": 0.41038376092910767, + "learning_rate": 0.0005, + "loss": 1.2855, + "step": 4068 + }, + { + "epoch": 3.25, + "grad_norm": 0.6094494462013245, + "learning_rate": 0.0005, + "loss": 1.2953, + "step": 4069 + }, + { + "epoch": 3.2507987220447285, + "grad_norm": 0.6274027228355408, + "learning_rate": 0.0005, + "loss": 1.2879, + "step": 4070 + }, + { + "epoch": 3.251597444089457, + "grad_norm": 0.8833006024360657, + "learning_rate": 0.0005, + "loss": 1.2806, + "step": 4071 + }, + { + "epoch": 3.252396166134185, + "grad_norm": 0.8688742518424988, + "learning_rate": 0.0005, + "loss": 1.2845, + "step": 4072 + }, + { + "epoch": 3.2531948881789137, + "grad_norm": 0.34751075506210327, + "learning_rate": 0.0005, + "loss": 1.2592, + "step": 4073 + }, + { + "epoch": 3.253993610223642, + "grad_norm": 0.4245823621749878, + "learning_rate": 0.0005, + "loss": 1.2529, + "step": 4074 + }, + { + "epoch": 3.2547923322683707, + "grad_norm": 0.4495961368083954, + "learning_rate": 0.0005, + "loss": 1.2438, + "step": 4075 + }, + { + "epoch": 3.255591054313099, + "grad_norm": 0.683125913143158, + "learning_rate": 0.0005, + "loss": 1.2297, + "step": 4076 + }, + { + "epoch": 3.2563897763578273, + "grad_norm": 0.4342438876628876, + "learning_rate": 0.0005, + "loss": 1.2421, + "step": 4077 + }, + { + "epoch": 3.257188498402556, + "grad_norm": 0.2018793523311615, + "learning_rate": 0.0005, + "loss": 1.2313, + "step": 4078 + }, + { + "epoch": 3.2579872204472844, + "grad_norm": 0.26145434379577637, + "learning_rate": 0.0005, + "loss": 1.218, + "step": 4079 + }, + { + "epoch": 3.258785942492013, + "grad_norm": 0.16941657662391663, + "learning_rate": 0.0005, + "loss": 1.2211, + "step": 4080 + }, + { + "epoch": 3.2595846645367414, + "grad_norm": 0.3158339262008667, + "learning_rate": 0.0005, + "loss": 1.2192, + "step": 4081 + }, + { + "epoch": 3.2603833865814695, + "grad_norm": 0.18630816042423248, + "learning_rate": 0.0005, + "loss": 1.2091, + "step": 4082 + }, + { + "epoch": 3.261182108626198, + "grad_norm": 0.19504855573177338, + "learning_rate": 0.0005, + "loss": 1.2047, + "step": 4083 + }, + { + "epoch": 3.2619808306709266, + "grad_norm": 0.19672146439552307, + "learning_rate": 0.0005, + "loss": 1.2022, + "step": 4084 + }, + { + "epoch": 3.262779552715655, + "grad_norm": 0.15959087014198303, + "learning_rate": 0.0005, + "loss": 1.1957, + "step": 4085 + }, + { + "epoch": 3.263578274760383, + "grad_norm": 0.18326745927333832, + "learning_rate": 0.0005, + "loss": 1.1835, + "step": 4086 + }, + { + "epoch": 3.2643769968051117, + "grad_norm": 0.23495830595493317, + "learning_rate": 0.0005, + "loss": 1.1837, + "step": 4087 + }, + { + "epoch": 3.2651757188498403, + "grad_norm": 0.22718247771263123, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 4088 + }, + { + "epoch": 3.265974440894569, + "grad_norm": 0.2913427948951721, + "learning_rate": 0.0005, + "loss": 1.1759, + "step": 4089 + }, + { + "epoch": 3.2667731629392973, + "grad_norm": 0.44531312584877014, + "learning_rate": 0.0005, + "loss": 1.1751, + "step": 4090 + }, + { + "epoch": 3.2675718849840254, + "grad_norm": 0.6265004277229309, + "learning_rate": 0.0005, + "loss": 1.184, + "step": 4091 + }, + { + "epoch": 3.268370607028754, + "grad_norm": 0.6119574904441833, + "learning_rate": 0.0005, + "loss": 1.186, + "step": 4092 + }, + { + "epoch": 3.2691693290734825, + "grad_norm": 0.23989497125148773, + "learning_rate": 0.0005, + "loss": 1.1634, + "step": 4093 + }, + { + "epoch": 3.269968051118211, + "grad_norm": 0.266013503074646, + "learning_rate": 0.0005, + "loss": 1.1693, + "step": 4094 + }, + { + "epoch": 3.270766773162939, + "grad_norm": 0.2205667793750763, + "learning_rate": 0.0005, + "loss": 1.1627, + "step": 4095 + }, + { + "epoch": 3.2715654952076676, + "grad_norm": 0.4600715935230255, + "learning_rate": 0.0005, + "loss": 1.1566, + "step": 4096 + }, + { + "epoch": 3.272364217252396, + "grad_norm": 0.6725661754608154, + "learning_rate": 0.0005, + "loss": 1.1806, + "step": 4097 + }, + { + "epoch": 3.2731629392971247, + "grad_norm": 0.3836606442928314, + "learning_rate": 0.0005, + "loss": 1.1613, + "step": 4098 + }, + { + "epoch": 3.273961661341853, + "grad_norm": 0.3752588629722595, + "learning_rate": 0.0005, + "loss": 1.1639, + "step": 4099 + }, + { + "epoch": 3.2747603833865817, + "grad_norm": 0.3297381103038788, + "learning_rate": 0.0005, + "loss": 1.1488, + "step": 4100 + }, + { + "epoch": 3.27555910543131, + "grad_norm": 0.5899438858032227, + "learning_rate": 0.0005, + "loss": 1.1486, + "step": 4101 + }, + { + "epoch": 3.2763578274760383, + "grad_norm": 0.5899466872215271, + "learning_rate": 0.0005, + "loss": 1.1533, + "step": 4102 + }, + { + "epoch": 3.277156549520767, + "grad_norm": 0.2944958209991455, + "learning_rate": 0.0005, + "loss": 1.1517, + "step": 4103 + }, + { + "epoch": 3.2779552715654954, + "grad_norm": 0.5870373249053955, + "learning_rate": 0.0005, + "loss": 1.1484, + "step": 4104 + }, + { + "epoch": 3.2787539936102235, + "grad_norm": 0.25267326831817627, + "learning_rate": 0.0005, + "loss": 1.1351, + "step": 4105 + }, + { + "epoch": 3.279552715654952, + "grad_norm": 0.20602582395076752, + "learning_rate": 0.0005, + "loss": 1.1387, + "step": 4106 + }, + { + "epoch": 3.2803514376996805, + "grad_norm": 0.4151447117328644, + "learning_rate": 0.0005, + "loss": 1.1338, + "step": 4107 + }, + { + "epoch": 3.281150159744409, + "grad_norm": 0.6591519117355347, + "learning_rate": 0.0005, + "loss": 1.1395, + "step": 4108 + }, + { + "epoch": 3.2819488817891376, + "grad_norm": 0.48510807752609253, + "learning_rate": 0.0005, + "loss": 1.1496, + "step": 4109 + }, + { + "epoch": 3.2827476038338657, + "grad_norm": 0.27803128957748413, + "learning_rate": 0.0005, + "loss": 1.139, + "step": 4110 + }, + { + "epoch": 3.283546325878594, + "grad_norm": 0.3939184546470642, + "learning_rate": 0.0005, + "loss": 1.141, + "step": 4111 + }, + { + "epoch": 3.2843450479233227, + "grad_norm": 0.18271984159946442, + "learning_rate": 0.0005, + "loss": 1.1284, + "step": 4112 + }, + { + "epoch": 3.2851437699680512, + "grad_norm": 0.19690747559070587, + "learning_rate": 0.0005, + "loss": 1.1286, + "step": 4113 + }, + { + "epoch": 3.2859424920127793, + "grad_norm": 0.22968755662441254, + "learning_rate": 0.0005, + "loss": 1.1316, + "step": 4114 + }, + { + "epoch": 3.286741214057508, + "grad_norm": 0.24908174574375153, + "learning_rate": 0.0005, + "loss": 1.1279, + "step": 4115 + }, + { + "epoch": 3.2875399361022364, + "grad_norm": 0.15813285112380981, + "learning_rate": 0.0005, + "loss": 1.1172, + "step": 4116 + }, + { + "epoch": 3.288338658146965, + "grad_norm": 0.1056000292301178, + "learning_rate": 0.0005, + "loss": 1.1143, + "step": 4117 + }, + { + "epoch": 3.2891373801916934, + "grad_norm": 0.19983351230621338, + "learning_rate": 0.0005, + "loss": 1.118, + "step": 4118 + }, + { + "epoch": 3.289936102236422, + "grad_norm": 0.13660027086734772, + "learning_rate": 0.0005, + "loss": 1.1124, + "step": 4119 + }, + { + "epoch": 3.29073482428115, + "grad_norm": 0.15008457005023956, + "learning_rate": 0.0005, + "loss": 1.1153, + "step": 4120 + }, + { + "epoch": 3.2915335463258786, + "grad_norm": 0.1475287824869156, + "learning_rate": 0.0005, + "loss": 1.105, + "step": 4121 + }, + { + "epoch": 3.292332268370607, + "grad_norm": 0.10478811711072922, + "learning_rate": 0.0005, + "loss": 1.1163, + "step": 4122 + }, + { + "epoch": 3.2931309904153356, + "grad_norm": 0.1577034890651703, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 4123 + }, + { + "epoch": 3.2939297124600637, + "grad_norm": 0.1019970178604126, + "learning_rate": 0.0005, + "loss": 1.1117, + "step": 4124 + }, + { + "epoch": 3.2947284345047922, + "grad_norm": 0.09229713678359985, + "learning_rate": 0.0005, + "loss": 1.0967, + "step": 4125 + }, + { + "epoch": 3.2955271565495208, + "grad_norm": 0.10029986500740051, + "learning_rate": 0.0005, + "loss": 1.111, + "step": 4126 + }, + { + "epoch": 3.2963258785942493, + "grad_norm": 0.14171569049358368, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4127 + }, + { + "epoch": 3.297124600638978, + "grad_norm": 0.17343609035015106, + "learning_rate": 0.0005, + "loss": 1.0991, + "step": 4128 + }, + { + "epoch": 3.297923322683706, + "grad_norm": 0.2738705277442932, + "learning_rate": 0.0005, + "loss": 1.1074, + "step": 4129 + }, + { + "epoch": 3.2987220447284344, + "grad_norm": 0.3518083691596985, + "learning_rate": 0.0005, + "loss": 1.106, + "step": 4130 + }, + { + "epoch": 3.299520766773163, + "grad_norm": 0.16174353659152985, + "learning_rate": 0.0005, + "loss": 1.0956, + "step": 4131 + }, + { + "epoch": 3.3003194888178915, + "grad_norm": 0.24402645230293274, + "learning_rate": 0.0005, + "loss": 1.1035, + "step": 4132 + }, + { + "epoch": 3.3011182108626196, + "grad_norm": 0.23362669348716736, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 4133 + }, + { + "epoch": 3.301916932907348, + "grad_norm": 0.1391523778438568, + "learning_rate": 0.0005, + "loss": 1.0954, + "step": 4134 + }, + { + "epoch": 3.3027156549520766, + "grad_norm": 0.1516295224428177, + "learning_rate": 0.0005, + "loss": 1.0968, + "step": 4135 + }, + { + "epoch": 3.303514376996805, + "grad_norm": 0.17463526129722595, + "learning_rate": 0.0005, + "loss": 1.104, + "step": 4136 + }, + { + "epoch": 3.3043130990415337, + "grad_norm": 0.13717398047447205, + "learning_rate": 0.0005, + "loss": 1.1003, + "step": 4137 + }, + { + "epoch": 3.3051118210862622, + "grad_norm": 0.16802728176116943, + "learning_rate": 0.0005, + "loss": 1.0909, + "step": 4138 + }, + { + "epoch": 3.3059105431309903, + "grad_norm": 0.11959057301282883, + "learning_rate": 0.0005, + "loss": 1.0892, + "step": 4139 + }, + { + "epoch": 3.306709265175719, + "grad_norm": 0.07706355303525925, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4140 + }, + { + "epoch": 3.3075079872204474, + "grad_norm": 0.07729125767946243, + "learning_rate": 0.0005, + "loss": 1.0984, + "step": 4141 + }, + { + "epoch": 3.308306709265176, + "grad_norm": 0.08654871582984924, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 4142 + }, + { + "epoch": 3.309105431309904, + "grad_norm": 0.11485479772090912, + "learning_rate": 0.0005, + "loss": 1.0902, + "step": 4143 + }, + { + "epoch": 3.3099041533546325, + "grad_norm": 0.10812658816576004, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4144 + }, + { + "epoch": 3.310702875399361, + "grad_norm": 0.08537860214710236, + "learning_rate": 0.0005, + "loss": 1.0826, + "step": 4145 + }, + { + "epoch": 3.3115015974440896, + "grad_norm": 0.10628878325223923, + "learning_rate": 0.0005, + "loss": 1.0878, + "step": 4146 + }, + { + "epoch": 3.312300319488818, + "grad_norm": 0.14903275668621063, + "learning_rate": 0.0005, + "loss": 1.0831, + "step": 4147 + }, + { + "epoch": 3.313099041533546, + "grad_norm": 0.09670894593000412, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4148 + }, + { + "epoch": 3.3138977635782747, + "grad_norm": 0.10959025472402573, + "learning_rate": 0.0005, + "loss": 1.094, + "step": 4149 + }, + { + "epoch": 3.3146964856230032, + "grad_norm": 0.10397703945636749, + "learning_rate": 0.0005, + "loss": 1.0885, + "step": 4150 + }, + { + "epoch": 3.3154952076677318, + "grad_norm": 0.07681623846292496, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 4151 + }, + { + "epoch": 3.31629392971246, + "grad_norm": 0.07938152551651001, + "learning_rate": 0.0005, + "loss": 1.0894, + "step": 4152 + }, + { + "epoch": 3.3170926517571884, + "grad_norm": 0.14678052067756653, + "learning_rate": 0.0005, + "loss": 1.0944, + "step": 4153 + }, + { + "epoch": 3.317891373801917, + "grad_norm": 0.15366105735301971, + "learning_rate": 0.0005, + "loss": 1.085, + "step": 4154 + }, + { + "epoch": 3.3186900958466454, + "grad_norm": 0.13449597358703613, + "learning_rate": 0.0005, + "loss": 1.0837, + "step": 4155 + }, + { + "epoch": 3.319488817891374, + "grad_norm": 0.0861068144440651, + "learning_rate": 0.0005, + "loss": 1.0857, + "step": 4156 + }, + { + "epoch": 3.3202875399361025, + "grad_norm": 0.0604286752641201, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 4157 + }, + { + "epoch": 3.3210862619808306, + "grad_norm": 0.08299542963504791, + "learning_rate": 0.0005, + "loss": 1.0865, + "step": 4158 + }, + { + "epoch": 3.321884984025559, + "grad_norm": 0.0738200917840004, + "learning_rate": 0.0005, + "loss": 1.0808, + "step": 4159 + }, + { + "epoch": 3.3226837060702876, + "grad_norm": 0.06450676172971725, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4160 + }, + { + "epoch": 3.323482428115016, + "grad_norm": 0.055281370878219604, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4161 + }, + { + "epoch": 3.3242811501597442, + "grad_norm": 0.09895910322666168, + "learning_rate": 0.0005, + "loss": 1.089, + "step": 4162 + }, + { + "epoch": 3.3250798722044728, + "grad_norm": 0.10338333994150162, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 4163 + }, + { + "epoch": 3.3258785942492013, + "grad_norm": 0.08346354216337204, + "learning_rate": 0.0005, + "loss": 1.0793, + "step": 4164 + }, + { + "epoch": 3.32667731629393, + "grad_norm": 0.15257857739925385, + "learning_rate": 0.0005, + "loss": 1.0853, + "step": 4165 + }, + { + "epoch": 3.3274760383386583, + "grad_norm": 0.1782383918762207, + "learning_rate": 0.0005, + "loss": 1.0848, + "step": 4166 + }, + { + "epoch": 3.3282747603833864, + "grad_norm": 0.09908363968133926, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4167 + }, + { + "epoch": 3.329073482428115, + "grad_norm": 0.18942143023014069, + "learning_rate": 0.0005, + "loss": 1.076, + "step": 4168 + }, + { + "epoch": 3.3298722044728435, + "grad_norm": 0.21095149219036102, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 4169 + }, + { + "epoch": 3.330670926517572, + "grad_norm": 0.11597894132137299, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 4170 + }, + { + "epoch": 3.3314696485623, + "grad_norm": 0.20450811088085175, + "learning_rate": 0.0005, + "loss": 1.08, + "step": 4171 + }, + { + "epoch": 3.3322683706070286, + "grad_norm": 0.1609300971031189, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4172 + }, + { + "epoch": 3.333067092651757, + "grad_norm": 0.14068877696990967, + "learning_rate": 0.0005, + "loss": 1.0835, + "step": 4173 + }, + { + "epoch": 3.3338658146964857, + "grad_norm": 0.11969266831874847, + "learning_rate": 0.0005, + "loss": 1.0818, + "step": 4174 + }, + { + "epoch": 3.334664536741214, + "grad_norm": 0.16986626386642456, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 4175 + }, + { + "epoch": 3.3354632587859427, + "grad_norm": 0.2065591812133789, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4176 + }, + { + "epoch": 3.336261980830671, + "grad_norm": 0.23542748391628265, + "learning_rate": 0.0005, + "loss": 1.0916, + "step": 4177 + }, + { + "epoch": 3.3370607028753994, + "grad_norm": 0.20896919071674347, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 4178 + }, + { + "epoch": 3.337859424920128, + "grad_norm": 0.16446076333522797, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 4179 + }, + { + "epoch": 3.3386581469648564, + "grad_norm": 0.11143177002668381, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 4180 + }, + { + "epoch": 3.3394568690095845, + "grad_norm": 0.0866970345377922, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 4181 + }, + { + "epoch": 3.340255591054313, + "grad_norm": 0.14608244597911835, + "learning_rate": 0.0005, + "loss": 1.0849, + "step": 4182 + }, + { + "epoch": 3.3410543130990416, + "grad_norm": 0.06152384728193283, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 4183 + }, + { + "epoch": 3.34185303514377, + "grad_norm": 0.14289656281471252, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4184 + }, + { + "epoch": 3.3426517571884986, + "grad_norm": 0.16735558211803436, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4185 + }, + { + "epoch": 3.3434504792332267, + "grad_norm": 0.09012678265571594, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 4186 + }, + { + "epoch": 3.344249201277955, + "grad_norm": 0.05861378088593483, + "learning_rate": 0.0005, + "loss": 1.0778, + "step": 4187 + }, + { + "epoch": 3.3450479233226837, + "grad_norm": 0.07123090326786041, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 4188 + }, + { + "epoch": 3.3458466453674123, + "grad_norm": 0.07879375666379929, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4189 + }, + { + "epoch": 3.3466453674121404, + "grad_norm": 0.0925324484705925, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 4190 + }, + { + "epoch": 3.347444089456869, + "grad_norm": 0.0686444416642189, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4191 + }, + { + "epoch": 3.3482428115015974, + "grad_norm": 0.08633724600076675, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4192 + }, + { + "epoch": 3.349041533546326, + "grad_norm": 0.056881021708250046, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4193 + }, + { + "epoch": 3.3498402555910545, + "grad_norm": 0.07752947509288788, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 4194 + }, + { + "epoch": 3.3506389776357826, + "grad_norm": 0.0927717313170433, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 4195 + }, + { + "epoch": 3.351437699680511, + "grad_norm": 0.09599179029464722, + "learning_rate": 0.0005, + "loss": 1.0764, + "step": 4196 + }, + { + "epoch": 3.3522364217252396, + "grad_norm": 0.09090889245271683, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 4197 + }, + { + "epoch": 3.353035143769968, + "grad_norm": 0.12757429480552673, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 4198 + }, + { + "epoch": 3.3538338658146967, + "grad_norm": 0.15210460126399994, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4199 + }, + { + "epoch": 3.3546325878594248, + "grad_norm": 0.10982836782932281, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 4200 + }, + { + "epoch": 3.3554313099041533, + "grad_norm": 0.056641776114702225, + "learning_rate": 0.0005, + "loss": 1.0783, + "step": 4201 + }, + { + "epoch": 3.356230031948882, + "grad_norm": 0.09506776928901672, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4202 + }, + { + "epoch": 3.3570287539936103, + "grad_norm": 0.12064918130636215, + "learning_rate": 0.0005, + "loss": 1.0724, + "step": 4203 + }, + { + "epoch": 3.357827476038339, + "grad_norm": 0.12343298643827438, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 4204 + }, + { + "epoch": 3.358626198083067, + "grad_norm": 0.11508476734161377, + "learning_rate": 0.0005, + "loss": 1.0781, + "step": 4205 + }, + { + "epoch": 3.3594249201277955, + "grad_norm": 0.07552453875541687, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4206 + }, + { + "epoch": 3.360223642172524, + "grad_norm": 0.10495936870574951, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 4207 + }, + { + "epoch": 3.3610223642172525, + "grad_norm": 0.13230633735656738, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4208 + }, + { + "epoch": 3.3618210862619806, + "grad_norm": 0.13003787398338318, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 4209 + }, + { + "epoch": 3.362619808306709, + "grad_norm": 0.09252234548330307, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 4210 + }, + { + "epoch": 3.3634185303514377, + "grad_norm": 0.07739317417144775, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 4211 + }, + { + "epoch": 3.364217252396166, + "grad_norm": 0.12185318768024445, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4212 + }, + { + "epoch": 3.3650159744408947, + "grad_norm": 0.17643119394779205, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4213 + }, + { + "epoch": 3.365814696485623, + "grad_norm": 0.10462872684001923, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 4214 + }, + { + "epoch": 3.3666134185303513, + "grad_norm": 0.1486569344997406, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 4215 + }, + { + "epoch": 3.36741214057508, + "grad_norm": 0.11858930438756943, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 4216 + }, + { + "epoch": 3.3682108626198084, + "grad_norm": 0.07907772809267044, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 4217 + }, + { + "epoch": 3.369009584664537, + "grad_norm": 0.5416387319564819, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 4218 + }, + { + "epoch": 3.369808306709265, + "grad_norm": 0.08767322450876236, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4219 + }, + { + "epoch": 3.3706070287539935, + "grad_norm": 0.09651107341051102, + "learning_rate": 0.0005, + "loss": 1.0714, + "step": 4220 + }, + { + "epoch": 3.371405750798722, + "grad_norm": 0.07548791915178299, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4221 + }, + { + "epoch": 3.3722044728434506, + "grad_norm": 0.09317605197429657, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4222 + }, + { + "epoch": 3.373003194888179, + "grad_norm": 0.07431582361459732, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4223 + }, + { + "epoch": 3.373801916932907, + "grad_norm": 0.12754018604755402, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 4224 + }, + { + "epoch": 3.3746006389776357, + "grad_norm": 0.12697845697402954, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4225 + }, + { + "epoch": 3.3753993610223643, + "grad_norm": 0.21522995829582214, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 4226 + }, + { + "epoch": 3.376198083067093, + "grad_norm": 0.08886270225048065, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 4227 + }, + { + "epoch": 3.376996805111821, + "grad_norm": 0.07107655704021454, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 4228 + }, + { + "epoch": 3.3777955271565494, + "grad_norm": 0.07452798634767532, + "learning_rate": 0.0005, + "loss": 1.0743, + "step": 4229 + }, + { + "epoch": 3.378594249201278, + "grad_norm": 0.10205573588609695, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4230 + }, + { + "epoch": 3.3793929712460065, + "grad_norm": 0.10990341752767563, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 4231 + }, + { + "epoch": 3.380191693290735, + "grad_norm": 0.08567643165588379, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 4232 + }, + { + "epoch": 3.380990415335463, + "grad_norm": 0.057073548436164856, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 4233 + }, + { + "epoch": 3.3817891373801916, + "grad_norm": 0.12602978944778442, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 4234 + }, + { + "epoch": 3.38258785942492, + "grad_norm": 0.1715400218963623, + "learning_rate": 0.0005, + "loss": 1.0796, + "step": 4235 + }, + { + "epoch": 3.3833865814696487, + "grad_norm": 0.13129903376102448, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 4236 + }, + { + "epoch": 3.384185303514377, + "grad_norm": 0.1308225691318512, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4237 + }, + { + "epoch": 3.3849840255591053, + "grad_norm": 0.1353990137577057, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4238 + }, + { + "epoch": 3.385782747603834, + "grad_norm": 0.08648121356964111, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4239 + }, + { + "epoch": 3.3865814696485623, + "grad_norm": 0.23568236827850342, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4240 + }, + { + "epoch": 3.387380191693291, + "grad_norm": 0.20514735579490662, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 4241 + }, + { + "epoch": 3.3881789137380194, + "grad_norm": 0.10276424884796143, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4242 + }, + { + "epoch": 3.3889776357827475, + "grad_norm": 0.1838751584291458, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 4243 + }, + { + "epoch": 3.389776357827476, + "grad_norm": 0.1697031557559967, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 4244 + }, + { + "epoch": 3.3905750798722045, + "grad_norm": 0.11439084261655807, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4245 + }, + { + "epoch": 3.391373801916933, + "grad_norm": 0.14021249115467072, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4246 + }, + { + "epoch": 3.392172523961661, + "grad_norm": 0.13989558815956116, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 4247 + }, + { + "epoch": 3.3929712460063897, + "grad_norm": 0.12039095908403397, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 4248 + }, + { + "epoch": 3.393769968051118, + "grad_norm": 0.17901045083999634, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 4249 + }, + { + "epoch": 3.3945686900958467, + "grad_norm": 0.1053776666522026, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 4250 + }, + { + "epoch": 3.3953674121405752, + "grad_norm": 1.7777512073516846, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 4251 + }, + { + "epoch": 3.3961661341853033, + "grad_norm": 0.06677904725074768, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 4252 + }, + { + "epoch": 3.396964856230032, + "grad_norm": 0.16123540699481964, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4253 + }, + { + "epoch": 3.3977635782747604, + "grad_norm": 0.21530884504318237, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 4254 + }, + { + "epoch": 3.398562300319489, + "grad_norm": 0.20979386568069458, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 4255 + }, + { + "epoch": 3.3993610223642174, + "grad_norm": 0.14755229651927948, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 4256 + }, + { + "epoch": 3.4001597444089455, + "grad_norm": 0.10182930529117584, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 4257 + }, + { + "epoch": 3.400958466453674, + "grad_norm": 0.11478064954280853, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 4258 + }, + { + "epoch": 3.4017571884984026, + "grad_norm": 0.2052452266216278, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 4259 + }, + { + "epoch": 3.402555910543131, + "grad_norm": 0.6292023062705994, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4260 + }, + { + "epoch": 3.4033546325878596, + "grad_norm": 0.0666726678609848, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4261 + }, + { + "epoch": 3.4041533546325877, + "grad_norm": 0.11848346143960953, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4262 + }, + { + "epoch": 3.4049520766773163, + "grad_norm": 0.15276756882667542, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4263 + }, + { + "epoch": 3.405750798722045, + "grad_norm": 0.08534786105155945, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4264 + }, + { + "epoch": 3.4065495207667733, + "grad_norm": 0.07453266531229019, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4265 + }, + { + "epoch": 3.4073482428115014, + "grad_norm": 0.12894752621650696, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 4266 + }, + { + "epoch": 3.40814696485623, + "grad_norm": 0.11341612786054611, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4267 + }, + { + "epoch": 3.4089456869009584, + "grad_norm": 0.06551265716552734, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 4268 + }, + { + "epoch": 3.409744408945687, + "grad_norm": 0.08828622102737427, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4269 + }, + { + "epoch": 3.4105431309904155, + "grad_norm": 0.06951884925365448, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4270 + }, + { + "epoch": 3.4113418530351436, + "grad_norm": 0.0785432904958725, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 4271 + }, + { + "epoch": 3.412140575079872, + "grad_norm": 0.06681766360998154, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4272 + }, + { + "epoch": 3.4129392971246006, + "grad_norm": 0.060111526399850845, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4273 + }, + { + "epoch": 3.413738019169329, + "grad_norm": 0.07451382279396057, + "learning_rate": 0.0005, + "loss": 1.0712, + "step": 4274 + }, + { + "epoch": 3.4145367412140573, + "grad_norm": 0.08646225184202194, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4275 + }, + { + "epoch": 3.415335463258786, + "grad_norm": 0.07061789929866791, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 4276 + }, + { + "epoch": 3.4161341853035143, + "grad_norm": 0.09554821997880936, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4277 + }, + { + "epoch": 3.416932907348243, + "grad_norm": 0.11288002133369446, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 4278 + }, + { + "epoch": 3.4177316293929714, + "grad_norm": 0.10565607994794846, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 4279 + }, + { + "epoch": 3.4185303514377, + "grad_norm": 0.08235503733158112, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 4280 + }, + { + "epoch": 3.419329073482428, + "grad_norm": 0.1302265226840973, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4281 + }, + { + "epoch": 3.4201277955271565, + "grad_norm": 0.07910848408937454, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 4282 + }, + { + "epoch": 3.420926517571885, + "grad_norm": 0.10624215006828308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4283 + }, + { + "epoch": 3.4217252396166136, + "grad_norm": 0.08545158058404922, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 4284 + }, + { + "epoch": 3.4225239616613417, + "grad_norm": 0.07010428607463837, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4285 + }, + { + "epoch": 3.42332268370607, + "grad_norm": 0.08256867527961731, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4286 + }, + { + "epoch": 3.4241214057507987, + "grad_norm": 0.13074247539043427, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4287 + }, + { + "epoch": 3.4249201277955272, + "grad_norm": 0.18332679569721222, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 4288 + }, + { + "epoch": 3.4257188498402558, + "grad_norm": 0.1671689748764038, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4289 + }, + { + "epoch": 3.426517571884984, + "grad_norm": 0.10386296361684799, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4290 + }, + { + "epoch": 3.4273162939297124, + "grad_norm": 0.07554108649492264, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 4291 + }, + { + "epoch": 3.428115015974441, + "grad_norm": 0.1138196587562561, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4292 + }, + { + "epoch": 3.4289137380191694, + "grad_norm": 0.1681462526321411, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4293 + }, + { + "epoch": 3.4297124600638975, + "grad_norm": 0.1833198368549347, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4294 + }, + { + "epoch": 3.430511182108626, + "grad_norm": 0.10269228368997574, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4295 + }, + { + "epoch": 3.4313099041533546, + "grad_norm": 0.08876223117113113, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4296 + }, + { + "epoch": 3.432108626198083, + "grad_norm": 0.21489253640174866, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4297 + }, + { + "epoch": 3.4329073482428116, + "grad_norm": 0.22669701278209686, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4298 + }, + { + "epoch": 3.43370607028754, + "grad_norm": 0.16946858167648315, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 4299 + }, + { + "epoch": 3.4345047923322682, + "grad_norm": 0.05162649229168892, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4300 + }, + { + "epoch": 3.4353035143769968, + "grad_norm": 0.09700657427310944, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 4301 + }, + { + "epoch": 3.4361022364217253, + "grad_norm": 0.14858263731002808, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 4302 + }, + { + "epoch": 3.436900958466454, + "grad_norm": 0.16938818991184235, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4303 + }, + { + "epoch": 3.437699680511182, + "grad_norm": 0.13441702723503113, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 4304 + }, + { + "epoch": 3.4384984025559104, + "grad_norm": 0.07661818712949753, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4305 + }, + { + "epoch": 3.439297124600639, + "grad_norm": 0.19436489045619965, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4306 + }, + { + "epoch": 3.4400958466453675, + "grad_norm": 0.20447906851768494, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 4307 + }, + { + "epoch": 3.440894568690096, + "grad_norm": 0.1414622664451599, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4308 + }, + { + "epoch": 3.441693290734824, + "grad_norm": 0.06289447098970413, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4309 + }, + { + "epoch": 3.4424920127795526, + "grad_norm": 0.0966482162475586, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4310 + }, + { + "epoch": 3.443290734824281, + "grad_norm": 0.1300116777420044, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 4311 + }, + { + "epoch": 3.4440894568690097, + "grad_norm": 0.11638098210096359, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 4312 + }, + { + "epoch": 3.4448881789137378, + "grad_norm": 0.08284632116556168, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4313 + }, + { + "epoch": 3.4456869009584663, + "grad_norm": 0.0617060512304306, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 4314 + }, + { + "epoch": 3.446485623003195, + "grad_norm": 0.12798283994197845, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 4315 + }, + { + "epoch": 3.4472843450479234, + "grad_norm": 0.12712593376636505, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4316 + }, + { + "epoch": 3.448083067092652, + "grad_norm": 0.09164460003376007, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 4317 + }, + { + "epoch": 3.4488817891373804, + "grad_norm": 0.07618964463472366, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4318 + }, + { + "epoch": 3.4496805111821085, + "grad_norm": 0.07986288517713547, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 4319 + }, + { + "epoch": 3.450479233226837, + "grad_norm": 0.0783228650689125, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4320 + }, + { + "epoch": 3.4512779552715656, + "grad_norm": 0.09899114072322845, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4321 + }, + { + "epoch": 3.452076677316294, + "grad_norm": 0.13710227608680725, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4322 + }, + { + "epoch": 3.452875399361022, + "grad_norm": 0.1281789392232895, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4323 + }, + { + "epoch": 3.4536741214057507, + "grad_norm": 0.11021110415458679, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4324 + }, + { + "epoch": 3.4544728434504792, + "grad_norm": 0.11450989544391632, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4325 + }, + { + "epoch": 3.4552715654952078, + "grad_norm": 0.09010434150695801, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4326 + }, + { + "epoch": 3.4560702875399363, + "grad_norm": 0.08817321807146072, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4327 + }, + { + "epoch": 3.4568690095846644, + "grad_norm": 0.06502921879291534, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4328 + }, + { + "epoch": 3.457667731629393, + "grad_norm": 0.13399769365787506, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 4329 + }, + { + "epoch": 3.4584664536741214, + "grad_norm": 0.19785602390766144, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4330 + }, + { + "epoch": 3.45926517571885, + "grad_norm": 0.15761834383010864, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4331 + }, + { + "epoch": 3.460063897763578, + "grad_norm": 0.11824636161327362, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 4332 + }, + { + "epoch": 3.4608626198083066, + "grad_norm": 0.07031631469726562, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4333 + }, + { + "epoch": 3.461661341853035, + "grad_norm": 0.09940601140260696, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4334 + }, + { + "epoch": 3.4624600638977636, + "grad_norm": 0.11931589990854263, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4335 + }, + { + "epoch": 3.463258785942492, + "grad_norm": 0.0967932790517807, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 4336 + }, + { + "epoch": 3.4640575079872207, + "grad_norm": 0.09523937106132507, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 4337 + }, + { + "epoch": 3.4648562300319488, + "grad_norm": 0.09964902698993683, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4338 + }, + { + "epoch": 3.4656549520766773, + "grad_norm": 0.09898022562265396, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4339 + }, + { + "epoch": 3.466453674121406, + "grad_norm": 0.05388521030545235, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4340 + }, + { + "epoch": 3.4672523961661343, + "grad_norm": 0.06455415487289429, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4341 + }, + { + "epoch": 3.4680511182108624, + "grad_norm": 0.05497310310602188, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4342 + }, + { + "epoch": 3.468849840255591, + "grad_norm": 0.049679841846227646, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4343 + }, + { + "epoch": 3.4696485623003195, + "grad_norm": 0.05664939060807228, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4344 + }, + { + "epoch": 3.470447284345048, + "grad_norm": 0.06651245057582855, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4345 + }, + { + "epoch": 3.4712460063897765, + "grad_norm": 0.08480475097894669, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4346 + }, + { + "epoch": 3.4720447284345046, + "grad_norm": 0.07331875711679459, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4347 + }, + { + "epoch": 3.472843450479233, + "grad_norm": 0.0505477711558342, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4348 + }, + { + "epoch": 3.4736421725239617, + "grad_norm": 0.06969176232814789, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4349 + }, + { + "epoch": 3.47444089456869, + "grad_norm": 0.08915391564369202, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4350 + }, + { + "epoch": 3.4752396166134183, + "grad_norm": 0.09378752112388611, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4351 + }, + { + "epoch": 3.476038338658147, + "grad_norm": 0.059195373207330704, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4352 + }, + { + "epoch": 3.4768370607028753, + "grad_norm": 0.07094884663820267, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 4353 + }, + { + "epoch": 3.477635782747604, + "grad_norm": 0.11091995984315872, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4354 + }, + { + "epoch": 3.4784345047923324, + "grad_norm": 0.14018885791301727, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 4355 + }, + { + "epoch": 3.479233226837061, + "grad_norm": 0.13553708791732788, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4356 + }, + { + "epoch": 3.480031948881789, + "grad_norm": 0.08005240559577942, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4357 + }, + { + "epoch": 3.4808306709265175, + "grad_norm": 0.05309261009097099, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 4358 + }, + { + "epoch": 3.481629392971246, + "grad_norm": 0.09956394135951996, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4359 + }, + { + "epoch": 3.4824281150159746, + "grad_norm": 0.13189470767974854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4360 + }, + { + "epoch": 3.4832268370607027, + "grad_norm": 0.13651393353939056, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4361 + }, + { + "epoch": 3.484025559105431, + "grad_norm": 0.12467528879642487, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4362 + }, + { + "epoch": 3.4848242811501597, + "grad_norm": 0.11428561061620712, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4363 + }, + { + "epoch": 3.4856230031948883, + "grad_norm": 0.12095288187265396, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4364 + }, + { + "epoch": 3.486421725239617, + "grad_norm": 0.05889631807804108, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 4365 + }, + { + "epoch": 3.487220447284345, + "grad_norm": 0.1158040463924408, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4366 + }, + { + "epoch": 3.4880191693290734, + "grad_norm": 0.11070148646831512, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4367 + }, + { + "epoch": 3.488817891373802, + "grad_norm": 0.0625298023223877, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4368 + }, + { + "epoch": 3.4896166134185305, + "grad_norm": 0.11865562945604324, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4369 + }, + { + "epoch": 3.4904153354632586, + "grad_norm": 0.12237154692411423, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4370 + }, + { + "epoch": 3.491214057507987, + "grad_norm": 0.05703050270676613, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4371 + }, + { + "epoch": 3.4920127795527156, + "grad_norm": 0.17314022779464722, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4372 + }, + { + "epoch": 3.492811501597444, + "grad_norm": 0.2984711825847626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4373 + }, + { + "epoch": 3.4936102236421727, + "grad_norm": 0.30129608511924744, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4374 + }, + { + "epoch": 3.494408945686901, + "grad_norm": 0.12154170870780945, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4375 + }, + { + "epoch": 3.4952076677316293, + "grad_norm": 0.12467148154973984, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4376 + }, + { + "epoch": 3.496006389776358, + "grad_norm": 0.23285721242427826, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4377 + }, + { + "epoch": 3.4968051118210863, + "grad_norm": 0.20723310112953186, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4378 + }, + { + "epoch": 3.497603833865815, + "grad_norm": 0.13221028447151184, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4379 + }, + { + "epoch": 3.498402555910543, + "grad_norm": 0.06008061394095421, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 4380 + }, + { + "epoch": 3.4992012779552715, + "grad_norm": 0.12877988815307617, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4381 + }, + { + "epoch": 3.5, + "grad_norm": 0.1951032429933548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4382 + }, + { + "epoch": 3.5007987220447285, + "grad_norm": 0.13804258406162262, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4383 + }, + { + "epoch": 3.501597444089457, + "grad_norm": 0.06761720031499863, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 4384 + }, + { + "epoch": 3.502396166134185, + "grad_norm": 0.13217084109783173, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 4385 + }, + { + "epoch": 3.5031948881789137, + "grad_norm": 0.11773377656936646, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4386 + }, + { + "epoch": 3.503993610223642, + "grad_norm": 0.07580399513244629, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 4387 + }, + { + "epoch": 3.5047923322683707, + "grad_norm": 0.1739586442708969, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4388 + }, + { + "epoch": 3.505591054313099, + "grad_norm": 0.14863203465938568, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 4389 + }, + { + "epoch": 3.5063897763578273, + "grad_norm": 0.07858511805534363, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4390 + }, + { + "epoch": 3.507188498402556, + "grad_norm": 0.15966418385505676, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4391 + }, + { + "epoch": 3.5079872204472844, + "grad_norm": 0.28761810064315796, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 4392 + }, + { + "epoch": 3.508785942492013, + "grad_norm": 0.24169668555259705, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4393 + }, + { + "epoch": 3.5095846645367414, + "grad_norm": 0.07907059788703918, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4394 + }, + { + "epoch": 3.5103833865814695, + "grad_norm": 0.20243291556835175, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 4395 + }, + { + "epoch": 3.511182108626198, + "grad_norm": 0.302198588848114, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4396 + }, + { + "epoch": 3.5119808306709266, + "grad_norm": 0.2544843554496765, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 4397 + }, + { + "epoch": 3.512779552715655, + "grad_norm": 0.07381684333086014, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4398 + }, + { + "epoch": 3.513578274760383, + "grad_norm": 0.17388348281383514, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 4399 + }, + { + "epoch": 3.5143769968051117, + "grad_norm": 0.2293306440114975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4400 + }, + { + "epoch": 3.5151757188498403, + "grad_norm": 0.07548263669013977, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4401 + }, + { + "epoch": 3.515974440894569, + "grad_norm": 0.1924273669719696, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4402 + }, + { + "epoch": 3.5167731629392973, + "grad_norm": 0.26867300271987915, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4403 + }, + { + "epoch": 3.5175718849840254, + "grad_norm": 0.14461541175842285, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4404 + }, + { + "epoch": 3.518370607028754, + "grad_norm": 0.12608370184898376, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4405 + }, + { + "epoch": 3.5191693290734825, + "grad_norm": 0.20579756796360016, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4406 + }, + { + "epoch": 3.519968051118211, + "grad_norm": 0.12286399304866791, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4407 + }, + { + "epoch": 3.520766773162939, + "grad_norm": 0.055247388780117035, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 4408 + }, + { + "epoch": 3.5215654952076676, + "grad_norm": 0.07877562195062637, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4409 + }, + { + "epoch": 3.522364217252396, + "grad_norm": 0.0769568607211113, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4410 + }, + { + "epoch": 3.5231629392971247, + "grad_norm": 0.0898609384894371, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4411 + }, + { + "epoch": 3.523961661341853, + "grad_norm": 0.057637594640254974, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4412 + }, + { + "epoch": 3.5247603833865817, + "grad_norm": 0.12046241015195847, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4413 + }, + { + "epoch": 3.52555910543131, + "grad_norm": 0.09949496388435364, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 4414 + }, + { + "epoch": 3.5263578274760383, + "grad_norm": 0.054411277174949646, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4415 + }, + { + "epoch": 3.527156549520767, + "grad_norm": 0.08293551951646805, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4416 + }, + { + "epoch": 3.527955271565495, + "grad_norm": 0.07669435441493988, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4417 + }, + { + "epoch": 3.5287539936102235, + "grad_norm": 0.06382326781749725, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4418 + }, + { + "epoch": 3.529552715654952, + "grad_norm": 0.07673322409391403, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4419 + }, + { + "epoch": 3.5303514376996805, + "grad_norm": 0.08052650839090347, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4420 + }, + { + "epoch": 3.531150159744409, + "grad_norm": 0.1354246884584427, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4421 + }, + { + "epoch": 3.5319488817891376, + "grad_norm": 0.07951574772596359, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 4422 + }, + { + "epoch": 3.5327476038338657, + "grad_norm": 0.11002526432275772, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 4423 + }, + { + "epoch": 3.533546325878594, + "grad_norm": 0.18597234785556793, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4424 + }, + { + "epoch": 3.5343450479233227, + "grad_norm": 0.12601099908351898, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 4425 + }, + { + "epoch": 3.5351437699680512, + "grad_norm": 0.11181886494159698, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4426 + }, + { + "epoch": 3.5359424920127793, + "grad_norm": 0.11489108949899673, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4427 + }, + { + "epoch": 3.536741214057508, + "grad_norm": 0.10422708839178085, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4428 + }, + { + "epoch": 3.5375399361022364, + "grad_norm": 0.13701972365379333, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4429 + }, + { + "epoch": 3.538338658146965, + "grad_norm": 0.10713281482458115, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4430 + }, + { + "epoch": 3.5391373801916934, + "grad_norm": 0.11508526653051376, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4431 + }, + { + "epoch": 3.539936102236422, + "grad_norm": 0.061856236308813095, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4432 + }, + { + "epoch": 3.54073482428115, + "grad_norm": 0.12080623209476471, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4433 + }, + { + "epoch": 3.5415335463258786, + "grad_norm": 0.12233573198318481, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4434 + }, + { + "epoch": 3.542332268370607, + "grad_norm": 0.07041362673044205, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4435 + }, + { + "epoch": 3.543130990415335, + "grad_norm": 0.1162526085972786, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4436 + }, + { + "epoch": 3.5439297124600637, + "grad_norm": 0.12962234020233154, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4437 + }, + { + "epoch": 3.5447284345047922, + "grad_norm": 0.1368536353111267, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4438 + }, + { + "epoch": 3.5455271565495208, + "grad_norm": 0.061806995421648026, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4439 + }, + { + "epoch": 3.5463258785942493, + "grad_norm": 0.11016163975000381, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4440 + }, + { + "epoch": 3.547124600638978, + "grad_norm": 0.0992715135216713, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4441 + }, + { + "epoch": 3.547923322683706, + "grad_norm": 0.14015190303325653, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4442 + }, + { + "epoch": 3.5487220447284344, + "grad_norm": 0.07255455106496811, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 4443 + }, + { + "epoch": 3.549520766773163, + "grad_norm": 0.13293872773647308, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4444 + }, + { + "epoch": 3.5503194888178915, + "grad_norm": 0.08923539519309998, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4445 + }, + { + "epoch": 3.5511182108626196, + "grad_norm": 0.10125918686389923, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4446 + }, + { + "epoch": 3.551916932907348, + "grad_norm": 0.12369748950004578, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4447 + }, + { + "epoch": 3.5527156549520766, + "grad_norm": 0.14656996726989746, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4448 + }, + { + "epoch": 3.553514376996805, + "grad_norm": 0.14212539792060852, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4449 + }, + { + "epoch": 3.5543130990415337, + "grad_norm": 0.08640166372060776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4450 + }, + { + "epoch": 3.5551118210862622, + "grad_norm": 0.05552735924720764, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4451 + }, + { + "epoch": 3.5559105431309903, + "grad_norm": 0.12888140976428986, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 4452 + }, + { + "epoch": 3.556709265175719, + "grad_norm": 0.10696940869092941, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 4453 + }, + { + "epoch": 3.5575079872204474, + "grad_norm": 0.06578963249921799, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 4454 + }, + { + "epoch": 3.5583067092651754, + "grad_norm": 0.16173291206359863, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4455 + }, + { + "epoch": 3.559105431309904, + "grad_norm": 0.1550486832857132, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4456 + }, + { + "epoch": 3.5599041533546325, + "grad_norm": 0.14084209501743317, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4457 + }, + { + "epoch": 3.560702875399361, + "grad_norm": 0.12024512141942978, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4458 + }, + { + "epoch": 3.5615015974440896, + "grad_norm": 0.12514936923980713, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4459 + }, + { + "epoch": 3.562300319488818, + "grad_norm": 0.16444219648838043, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4460 + }, + { + "epoch": 3.563099041533546, + "grad_norm": 0.11520830541849136, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4461 + }, + { + "epoch": 3.5638977635782747, + "grad_norm": 0.07884586602449417, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4462 + }, + { + "epoch": 3.5646964856230032, + "grad_norm": 0.1655684858560562, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4463 + }, + { + "epoch": 3.5654952076677318, + "grad_norm": 0.15222500264644623, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 4464 + }, + { + "epoch": 3.56629392971246, + "grad_norm": 0.06106618419289589, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 4465 + }, + { + "epoch": 3.5670926517571884, + "grad_norm": 0.10545333474874496, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4466 + }, + { + "epoch": 3.567891373801917, + "grad_norm": 0.1353088915348053, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4467 + }, + { + "epoch": 3.5686900958466454, + "grad_norm": 0.11200091242790222, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4468 + }, + { + "epoch": 3.569488817891374, + "grad_norm": 0.052965741604566574, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4469 + }, + { + "epoch": 3.5702875399361025, + "grad_norm": 0.1244843453168869, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4470 + }, + { + "epoch": 3.5710862619808306, + "grad_norm": 0.1160016730427742, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4471 + }, + { + "epoch": 3.571884984025559, + "grad_norm": 0.04874402657151222, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4472 + }, + { + "epoch": 3.5726837060702876, + "grad_norm": 0.14222301542758942, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4473 + }, + { + "epoch": 3.5734824281150157, + "grad_norm": 0.1190859004855156, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4474 + }, + { + "epoch": 3.5742811501597442, + "grad_norm": 0.0659632682800293, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4475 + }, + { + "epoch": 3.5750798722044728, + "grad_norm": 0.07350483536720276, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4476 + }, + { + "epoch": 3.5758785942492013, + "grad_norm": 0.1220504492521286, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4477 + }, + { + "epoch": 3.57667731629393, + "grad_norm": 0.08952966332435608, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4478 + }, + { + "epoch": 3.5774760383386583, + "grad_norm": 0.08828000724315643, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4479 + }, + { + "epoch": 3.5782747603833864, + "grad_norm": 0.14621564745903015, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4480 + }, + { + "epoch": 3.579073482428115, + "grad_norm": 0.13653770089149475, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4481 + }, + { + "epoch": 3.5798722044728435, + "grad_norm": 0.0682564228773117, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4482 + }, + { + "epoch": 3.580670926517572, + "grad_norm": 0.06511309742927551, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4483 + }, + { + "epoch": 3.5814696485623, + "grad_norm": 0.08800239861011505, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4484 + }, + { + "epoch": 3.5822683706070286, + "grad_norm": 0.06488335877656937, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4485 + }, + { + "epoch": 3.583067092651757, + "grad_norm": 0.06505738198757172, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4486 + }, + { + "epoch": 3.5838658146964857, + "grad_norm": 0.07395542412996292, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4487 + }, + { + "epoch": 3.584664536741214, + "grad_norm": 0.06717971712350845, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4488 + }, + { + "epoch": 3.5854632587859427, + "grad_norm": 0.056708067655563354, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4489 + }, + { + "epoch": 3.586261980830671, + "grad_norm": 0.06316737830638885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4490 + }, + { + "epoch": 3.5870607028753994, + "grad_norm": 0.06079665198922157, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4491 + }, + { + "epoch": 3.587859424920128, + "grad_norm": 0.1293981820344925, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4492 + }, + { + "epoch": 3.588658146964856, + "grad_norm": 0.08021418750286102, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4493 + }, + { + "epoch": 3.5894568690095845, + "grad_norm": 0.096865214407444, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4494 + }, + { + "epoch": 3.590255591054313, + "grad_norm": 0.06794966757297516, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4495 + }, + { + "epoch": 3.5910543130990416, + "grad_norm": 0.04527222737669945, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4496 + }, + { + "epoch": 3.59185303514377, + "grad_norm": 0.07153941690921783, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4497 + }, + { + "epoch": 3.5926517571884986, + "grad_norm": 0.07480445504188538, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4498 + }, + { + "epoch": 3.5934504792332267, + "grad_norm": 0.09161835163831711, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4499 + }, + { + "epoch": 3.594249201277955, + "grad_norm": 0.08420681953430176, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 4500 + }, + { + "epoch": 3.5950479233226837, + "grad_norm": 0.04745415225625038, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4501 + }, + { + "epoch": 3.5958466453674123, + "grad_norm": 0.061325494199991226, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4502 + }, + { + "epoch": 3.5966453674121404, + "grad_norm": 0.08550430834293365, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 4503 + }, + { + "epoch": 3.597444089456869, + "grad_norm": 0.09530419111251831, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4504 + }, + { + "epoch": 3.5982428115015974, + "grad_norm": 0.10484769195318222, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4505 + }, + { + "epoch": 3.599041533546326, + "grad_norm": 0.08398665487766266, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4506 + }, + { + "epoch": 3.5998402555910545, + "grad_norm": 0.1644149124622345, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4507 + }, + { + "epoch": 3.600638977635783, + "grad_norm": 0.0803244560956955, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4508 + }, + { + "epoch": 3.601437699680511, + "grad_norm": 0.12512895464897156, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4509 + }, + { + "epoch": 3.6022364217252396, + "grad_norm": 0.1404576301574707, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4510 + }, + { + "epoch": 3.603035143769968, + "grad_norm": 0.10823316127061844, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4511 + }, + { + "epoch": 3.6038338658146962, + "grad_norm": 0.06985688954591751, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4512 + }, + { + "epoch": 3.6046325878594248, + "grad_norm": 0.1651264876127243, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4513 + }, + { + "epoch": 3.6054313099041533, + "grad_norm": 0.19752484560012817, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4514 + }, + { + "epoch": 3.606230031948882, + "grad_norm": 0.20005464553833008, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4515 + }, + { + "epoch": 3.6070287539936103, + "grad_norm": 0.1478145569562912, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4516 + }, + { + "epoch": 3.607827476038339, + "grad_norm": 0.05737901106476784, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4517 + }, + { + "epoch": 3.608626198083067, + "grad_norm": 0.16174650192260742, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4518 + }, + { + "epoch": 3.6094249201277955, + "grad_norm": 0.1959141194820404, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4519 + }, + { + "epoch": 3.610223642172524, + "grad_norm": 0.09767267853021622, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4520 + }, + { + "epoch": 3.6110223642172525, + "grad_norm": 0.10553760081529617, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4521 + }, + { + "epoch": 3.6118210862619806, + "grad_norm": 0.19380977749824524, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4522 + }, + { + "epoch": 3.612619808306709, + "grad_norm": 0.2024526745080948, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4523 + }, + { + "epoch": 3.6134185303514377, + "grad_norm": 0.09705837070941925, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 4524 + }, + { + "epoch": 3.614217252396166, + "grad_norm": 0.12530986964702606, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4525 + }, + { + "epoch": 3.6150159744408947, + "grad_norm": 0.20901283621788025, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4526 + }, + { + "epoch": 3.6158146964856233, + "grad_norm": 0.16532309353351593, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4527 + }, + { + "epoch": 3.6166134185303513, + "grad_norm": 0.18353991210460663, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4528 + }, + { + "epoch": 3.61741214057508, + "grad_norm": 0.12912365794181824, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4529 + }, + { + "epoch": 3.6182108626198084, + "grad_norm": 0.2052653580904007, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4530 + }, + { + "epoch": 3.6190095846645365, + "grad_norm": 0.1395503133535385, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4531 + }, + { + "epoch": 3.619808306709265, + "grad_norm": 0.07939961552619934, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4532 + }, + { + "epoch": 3.6206070287539935, + "grad_norm": 0.10098318755626678, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4533 + }, + { + "epoch": 3.621405750798722, + "grad_norm": 0.14332561194896698, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4534 + }, + { + "epoch": 3.6222044728434506, + "grad_norm": 0.09697199612855911, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4535 + }, + { + "epoch": 3.623003194888179, + "grad_norm": 0.07785658538341522, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4536 + }, + { + "epoch": 3.623801916932907, + "grad_norm": 0.11263108998537064, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4537 + }, + { + "epoch": 3.6246006389776357, + "grad_norm": 0.18257030844688416, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4538 + }, + { + "epoch": 3.6253993610223643, + "grad_norm": 0.1456373631954193, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4539 + }, + { + "epoch": 3.626198083067093, + "grad_norm": 0.06831679493188858, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4540 + }, + { + "epoch": 3.626996805111821, + "grad_norm": 0.12324535846710205, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4541 + }, + { + "epoch": 3.6277955271565494, + "grad_norm": 0.15868282318115234, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4542 + }, + { + "epoch": 3.628594249201278, + "grad_norm": 0.09355167299509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4543 + }, + { + "epoch": 3.6293929712460065, + "grad_norm": 0.08047328144311905, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4544 + }, + { + "epoch": 3.630191693290735, + "grad_norm": 0.12683328986167908, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4545 + }, + { + "epoch": 3.6309904153354635, + "grad_norm": 0.11964920908212662, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4546 + }, + { + "epoch": 3.6317891373801916, + "grad_norm": 0.0504109226167202, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 4547 + }, + { + "epoch": 3.63258785942492, + "grad_norm": 0.11909852921962738, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 4548 + }, + { + "epoch": 3.6333865814696487, + "grad_norm": 0.16763992607593536, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 4549 + }, + { + "epoch": 3.6341853035143767, + "grad_norm": 0.1486649513244629, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4550 + }, + { + "epoch": 3.6349840255591053, + "grad_norm": 0.06941305845975876, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4551 + }, + { + "epoch": 3.635782747603834, + "grad_norm": 0.1177566722035408, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4552 + }, + { + "epoch": 3.6365814696485623, + "grad_norm": 0.23368601500988007, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4553 + }, + { + "epoch": 3.637380191693291, + "grad_norm": 0.24657249450683594, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4554 + }, + { + "epoch": 3.6381789137380194, + "grad_norm": 0.10063605010509491, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4555 + }, + { + "epoch": 3.6389776357827475, + "grad_norm": 0.1553603708744049, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4556 + }, + { + "epoch": 3.639776357827476, + "grad_norm": 0.25588107109069824, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4557 + }, + { + "epoch": 3.6405750798722045, + "grad_norm": 0.15270236134529114, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4558 + }, + { + "epoch": 3.641373801916933, + "grad_norm": 0.108666330575943, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4559 + }, + { + "epoch": 3.642172523961661, + "grad_norm": 0.19828133285045624, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4560 + }, + { + "epoch": 3.6429712460063897, + "grad_norm": 0.21500051021575928, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4561 + }, + { + "epoch": 3.643769968051118, + "grad_norm": 0.16299934685230255, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4562 + }, + { + "epoch": 3.6445686900958467, + "grad_norm": 0.07390763610601425, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4563 + }, + { + "epoch": 3.6453674121405752, + "grad_norm": 0.22709119319915771, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4564 + }, + { + "epoch": 3.6461661341853038, + "grad_norm": 0.15557943284511566, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4565 + }, + { + "epoch": 3.646964856230032, + "grad_norm": 0.062457580119371414, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4566 + }, + { + "epoch": 3.6477635782747604, + "grad_norm": 0.09101095795631409, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4567 + }, + { + "epoch": 3.648562300319489, + "grad_norm": 0.08700825273990631, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 4568 + }, + { + "epoch": 3.649361022364217, + "grad_norm": 0.058703795075416565, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4569 + }, + { + "epoch": 3.6501597444089455, + "grad_norm": 0.056776538491249084, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4570 + }, + { + "epoch": 3.650958466453674, + "grad_norm": 0.062245409935712814, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4571 + }, + { + "epoch": 3.6517571884984026, + "grad_norm": 0.0534074492752552, + "learning_rate": 0.0005, + "loss": 1.0423, + "step": 4572 + }, + { + "epoch": 3.652555910543131, + "grad_norm": 0.09061384946107864, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4573 + }, + { + "epoch": 3.6533546325878596, + "grad_norm": 0.07323598116636276, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 4574 + }, + { + "epoch": 3.6541533546325877, + "grad_norm": 0.1120329350233078, + "learning_rate": 0.0005, + "loss": 1.0414, + "step": 4575 + }, + { + "epoch": 3.6549520766773163, + "grad_norm": 0.07965485006570816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 4576 + }, + { + "epoch": 3.655750798722045, + "grad_norm": 0.06320462375879288, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4577 + }, + { + "epoch": 3.6565495207667733, + "grad_norm": 0.07869421690702438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4578 + }, + { + "epoch": 3.6573482428115014, + "grad_norm": 0.09003151208162308, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 4579 + }, + { + "epoch": 3.65814696485623, + "grad_norm": 0.05570388212800026, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4580 + }, + { + "epoch": 3.6589456869009584, + "grad_norm": 0.15563733875751495, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4581 + }, + { + "epoch": 3.659744408945687, + "grad_norm": 0.1422414481639862, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4582 + }, + { + "epoch": 3.6605431309904155, + "grad_norm": 0.13704177737236023, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 4583 + }, + { + "epoch": 3.661341853035144, + "grad_norm": 0.36126458644866943, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4584 + }, + { + "epoch": 3.662140575079872, + "grad_norm": 0.09024632722139359, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4585 + }, + { + "epoch": 3.6629392971246006, + "grad_norm": 0.07135412096977234, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4586 + }, + { + "epoch": 3.663738019169329, + "grad_norm": 0.06172417849302292, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4587 + }, + { + "epoch": 3.6645367412140573, + "grad_norm": 0.05962595343589783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4588 + }, + { + "epoch": 3.665335463258786, + "grad_norm": 0.07063078880310059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4589 + }, + { + "epoch": 3.6661341853035143, + "grad_norm": 0.1445596069097519, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4590 + }, + { + "epoch": 3.666932907348243, + "grad_norm": 0.09224060922861099, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4591 + }, + { + "epoch": 3.6677316293929714, + "grad_norm": 0.10353037714958191, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4592 + }, + { + "epoch": 3.6685303514377, + "grad_norm": 0.10922796279191971, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4593 + }, + { + "epoch": 3.669329073482428, + "grad_norm": 0.08728764951229095, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4594 + }, + { + "epoch": 3.6701277955271565, + "grad_norm": 0.0639081671833992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4595 + }, + { + "epoch": 3.670926517571885, + "grad_norm": 0.050491299480199814, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4596 + }, + { + "epoch": 3.6717252396166136, + "grad_norm": 0.07127548009157181, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4597 + }, + { + "epoch": 3.6725239616613417, + "grad_norm": 0.05432606860995293, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4598 + }, + { + "epoch": 3.67332268370607, + "grad_norm": 0.0653342455625534, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4599 + }, + { + "epoch": 3.6741214057507987, + "grad_norm": 0.08766797184944153, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 4600 + }, + { + "epoch": 3.6749201277955272, + "grad_norm": 0.0816602036356926, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4601 + }, + { + "epoch": 3.6757188498402558, + "grad_norm": 0.08774783462285995, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4602 + }, + { + "epoch": 3.6765175718849843, + "grad_norm": 0.07776570320129395, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4603 + }, + { + "epoch": 3.6773162939297124, + "grad_norm": 0.07067213952541351, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4604 + }, + { + "epoch": 3.678115015974441, + "grad_norm": 0.06581863760948181, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4605 + }, + { + "epoch": 3.6789137380191694, + "grad_norm": 0.08631278574466705, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4606 + }, + { + "epoch": 3.6797124600638975, + "grad_norm": 0.10875384509563446, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4607 + }, + { + "epoch": 3.680511182108626, + "grad_norm": 0.11207764595746994, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4608 + }, + { + "epoch": 3.6813099041533546, + "grad_norm": 0.08943730592727661, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4609 + }, + { + "epoch": 3.682108626198083, + "grad_norm": 0.1922001987695694, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4610 + }, + { + "epoch": 3.6829073482428116, + "grad_norm": 0.10121189057826996, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4611 + }, + { + "epoch": 3.68370607028754, + "grad_norm": 0.05991055443882942, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4612 + }, + { + "epoch": 3.6845047923322682, + "grad_norm": 0.0897853821516037, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4613 + }, + { + "epoch": 3.6853035143769968, + "grad_norm": 0.13160353899002075, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4614 + }, + { + "epoch": 3.6861022364217253, + "grad_norm": 0.13855913281440735, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4615 + }, + { + "epoch": 3.686900958466454, + "grad_norm": 0.11086787283420563, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4616 + }, + { + "epoch": 3.687699680511182, + "grad_norm": 0.07992085069417953, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4617 + }, + { + "epoch": 3.6884984025559104, + "grad_norm": 0.11618958413600922, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4618 + }, + { + "epoch": 3.689297124600639, + "grad_norm": 0.19551296532154083, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4619 + }, + { + "epoch": 3.6900958466453675, + "grad_norm": 0.20239807665348053, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 4620 + }, + { + "epoch": 3.690894568690096, + "grad_norm": 0.13233833014965057, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4621 + }, + { + "epoch": 3.6916932907348246, + "grad_norm": 0.08789848536252975, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4622 + }, + { + "epoch": 3.6924920127795526, + "grad_norm": 0.2363075315952301, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4623 + }, + { + "epoch": 3.693290734824281, + "grad_norm": 0.2585245668888092, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4624 + }, + { + "epoch": 3.6940894568690097, + "grad_norm": 0.15822109580039978, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4625 + }, + { + "epoch": 3.6948881789137378, + "grad_norm": 0.07197296619415283, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4626 + }, + { + "epoch": 3.6956869009584663, + "grad_norm": 0.21067900955677032, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4627 + }, + { + "epoch": 3.696485623003195, + "grad_norm": 0.19520802795886993, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4628 + }, + { + "epoch": 3.6972843450479234, + "grad_norm": 0.08310793340206146, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4629 + }, + { + "epoch": 3.698083067092652, + "grad_norm": 0.2118932604789734, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 4630 + }, + { + "epoch": 3.6988817891373804, + "grad_norm": 0.2236505001783371, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4631 + }, + { + "epoch": 3.6996805111821085, + "grad_norm": 0.16256077587604523, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4632 + }, + { + "epoch": 3.700479233226837, + "grad_norm": 0.14406970143318176, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4633 + }, + { + "epoch": 3.7012779552715656, + "grad_norm": 0.09738676995038986, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 4634 + }, + { + "epoch": 3.702076677316294, + "grad_norm": 0.07531408965587616, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4635 + }, + { + "epoch": 3.702875399361022, + "grad_norm": 0.11631188541650772, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 4636 + }, + { + "epoch": 3.7036741214057507, + "grad_norm": 0.11661874502897263, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4637 + }, + { + "epoch": 3.7044728434504792, + "grad_norm": 0.11709950119256973, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 4638 + }, + { + "epoch": 3.7052715654952078, + "grad_norm": 0.13420704007148743, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4639 + }, + { + "epoch": 3.7060702875399363, + "grad_norm": 0.08842958509922028, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4640 + }, + { + "epoch": 3.706869009584665, + "grad_norm": 0.07295326143503189, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4641 + }, + { + "epoch": 3.707667731629393, + "grad_norm": 0.14573390781879425, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4642 + }, + { + "epoch": 3.7084664536741214, + "grad_norm": 0.06639868766069412, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 4643 + }, + { + "epoch": 3.70926517571885, + "grad_norm": 0.05936001241207123, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4644 + }, + { + "epoch": 3.710063897763578, + "grad_norm": 0.06534209847450256, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 4645 + }, + { + "epoch": 3.7108626198083066, + "grad_norm": 0.13101834058761597, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4646 + }, + { + "epoch": 3.711661341853035, + "grad_norm": 0.07707498222589493, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4647 + }, + { + "epoch": 3.7124600638977636, + "grad_norm": 0.09272165596485138, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4648 + }, + { + "epoch": 3.713258785942492, + "grad_norm": 0.12538838386535645, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4649 + }, + { + "epoch": 3.7140575079872207, + "grad_norm": 0.10816318541765213, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4650 + }, + { + "epoch": 3.7148562300319488, + "grad_norm": 0.10610290616750717, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4651 + }, + { + "epoch": 3.7156549520766773, + "grad_norm": 0.09520592540502548, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4652 + }, + { + "epoch": 3.716453674121406, + "grad_norm": 0.05595150217413902, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4653 + }, + { + "epoch": 3.7172523961661343, + "grad_norm": 0.08114545047283173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4654 + }, + { + "epoch": 3.7180511182108624, + "grad_norm": 0.16090086102485657, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4655 + }, + { + "epoch": 3.718849840255591, + "grad_norm": 0.16332058608531952, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4656 + }, + { + "epoch": 3.7196485623003195, + "grad_norm": 0.17694437503814697, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4657 + }, + { + "epoch": 3.720447284345048, + "grad_norm": 0.16341771185398102, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 4658 + }, + { + "epoch": 3.7212460063897765, + "grad_norm": 0.12268038839101791, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4659 + }, + { + "epoch": 3.722044728434505, + "grad_norm": 0.09971031546592712, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4660 + }, + { + "epoch": 3.722843450479233, + "grad_norm": 0.08546486496925354, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4661 + }, + { + "epoch": 3.7236421725239617, + "grad_norm": 0.15427617728710175, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 4662 + }, + { + "epoch": 3.72444089456869, + "grad_norm": 0.1291000247001648, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4663 + }, + { + "epoch": 3.7252396166134183, + "grad_norm": 0.06823746860027313, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4664 + }, + { + "epoch": 3.726038338658147, + "grad_norm": 0.08133388310670853, + "learning_rate": 0.0005, + "loss": 1.0453, + "step": 4665 + }, + { + "epoch": 3.7268370607028753, + "grad_norm": 0.08803416788578033, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4666 + }, + { + "epoch": 3.727635782747604, + "grad_norm": 0.05898858234286308, + "learning_rate": 0.0005, + "loss": 1.0366, + "step": 4667 + }, + { + "epoch": 3.7284345047923324, + "grad_norm": 0.07650687545537949, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 4668 + }, + { + "epoch": 3.729233226837061, + "grad_norm": 0.15048138797283173, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4669 + }, + { + "epoch": 3.730031948881789, + "grad_norm": 0.08594254404306412, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4670 + }, + { + "epoch": 3.7308306709265175, + "grad_norm": 0.05322937294840813, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4671 + }, + { + "epoch": 3.731629392971246, + "grad_norm": 0.14541727304458618, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4672 + }, + { + "epoch": 3.7324281150159746, + "grad_norm": 0.10300826281309128, + "learning_rate": 0.0005, + "loss": 1.0455, + "step": 4673 + }, + { + "epoch": 3.7332268370607027, + "grad_norm": 0.05903324484825134, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 4674 + }, + { + "epoch": 3.734025559105431, + "grad_norm": 0.07101032137870789, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4675 + }, + { + "epoch": 3.7348242811501597, + "grad_norm": 0.09166763722896576, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4676 + }, + { + "epoch": 3.7356230031948883, + "grad_norm": 0.06929054856300354, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4677 + }, + { + "epoch": 3.736421725239617, + "grad_norm": 0.05935844033956528, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4678 + }, + { + "epoch": 3.737220447284345, + "grad_norm": 0.09101571142673492, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4679 + }, + { + "epoch": 3.7380191693290734, + "grad_norm": 0.0979514792561531, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4680 + }, + { + "epoch": 3.738817891373802, + "grad_norm": 0.07105522602796555, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4681 + }, + { + "epoch": 3.7396166134185305, + "grad_norm": 0.05741708725690842, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4682 + }, + { + "epoch": 3.7404153354632586, + "grad_norm": 0.051515400409698486, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4683 + }, + { + "epoch": 3.741214057507987, + "grad_norm": 0.06484496593475342, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4684 + }, + { + "epoch": 3.7420127795527156, + "grad_norm": 0.056751761585474014, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4685 + }, + { + "epoch": 3.742811501597444, + "grad_norm": 0.09628041833639145, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4686 + }, + { + "epoch": 3.7436102236421727, + "grad_norm": 0.13367851078510284, + "learning_rate": 0.0005, + "loss": 1.0421, + "step": 4687 + }, + { + "epoch": 3.744408945686901, + "grad_norm": 0.10439570248126984, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 4688 + }, + { + "epoch": 3.7452076677316293, + "grad_norm": 0.05516012758016586, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 4689 + }, + { + "epoch": 3.746006389776358, + "grad_norm": 0.0721910372376442, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 4690 + }, + { + "epoch": 3.7468051118210863, + "grad_norm": 0.10327166318893433, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4691 + }, + { + "epoch": 3.747603833865815, + "grad_norm": 0.10419414937496185, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4692 + }, + { + "epoch": 3.748402555910543, + "grad_norm": 0.07322157919406891, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4693 + }, + { + "epoch": 3.7492012779552715, + "grad_norm": 0.05000368133187294, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4694 + }, + { + "epoch": 3.75, + "grad_norm": 0.055239707231521606, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4695 + }, + { + "epoch": 3.7507987220447285, + "grad_norm": 0.14060117304325104, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 4696 + }, + { + "epoch": 3.751597444089457, + "grad_norm": 0.1366022527217865, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4697 + }, + { + "epoch": 3.752396166134185, + "grad_norm": 0.15003731846809387, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4698 + }, + { + "epoch": 3.7531948881789137, + "grad_norm": 0.11602472513914108, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4699 + }, + { + "epoch": 3.753993610223642, + "grad_norm": 0.06956090778112411, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4700 + }, + { + "epoch": 3.7547923322683707, + "grad_norm": 0.04711974412202835, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4701 + }, + { + "epoch": 3.755591054313099, + "grad_norm": 0.09257466346025467, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4702 + }, + { + "epoch": 3.7563897763578273, + "grad_norm": 0.06598426401615143, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4703 + }, + { + "epoch": 3.757188498402556, + "grad_norm": 0.06239036098122597, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4704 + }, + { + "epoch": 3.7579872204472844, + "grad_norm": 0.10065969824790955, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 4705 + }, + { + "epoch": 3.758785942492013, + "grad_norm": 0.12874993681907654, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4706 + }, + { + "epoch": 3.7595846645367414, + "grad_norm": 0.10291960090398788, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 4707 + }, + { + "epoch": 3.7603833865814695, + "grad_norm": 0.06138000637292862, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4708 + }, + { + "epoch": 3.761182108626198, + "grad_norm": 0.11565262079238892, + "learning_rate": 0.0005, + "loss": 1.0441, + "step": 4709 + }, + { + "epoch": 3.7619808306709266, + "grad_norm": 0.08041521906852722, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 4710 + }, + { + "epoch": 3.762779552715655, + "grad_norm": 0.07228218764066696, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4711 + }, + { + "epoch": 3.763578274760383, + "grad_norm": 0.09155906736850739, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4712 + }, + { + "epoch": 3.7643769968051117, + "grad_norm": 0.07468429207801819, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 4713 + }, + { + "epoch": 3.7651757188498403, + "grad_norm": 0.07629574090242386, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 4714 + }, + { + "epoch": 3.765974440894569, + "grad_norm": 0.1118689477443695, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 4715 + }, + { + "epoch": 3.7667731629392973, + "grad_norm": 0.07983580976724625, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4716 + }, + { + "epoch": 3.7675718849840254, + "grad_norm": 0.07225694507360458, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4717 + }, + { + "epoch": 3.768370607028754, + "grad_norm": 0.1322079598903656, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 4718 + }, + { + "epoch": 3.7691693290734825, + "grad_norm": 0.17217211425304413, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4719 + }, + { + "epoch": 3.769968051118211, + "grad_norm": 0.14665336906909943, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4720 + }, + { + "epoch": 3.770766773162939, + "grad_norm": 0.09977035969495773, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4721 + }, + { + "epoch": 3.7715654952076676, + "grad_norm": 0.1346946358680725, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4722 + }, + { + "epoch": 3.772364217252396, + "grad_norm": 0.17330871522426605, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 4723 + }, + { + "epoch": 3.7731629392971247, + "grad_norm": 0.17789506912231445, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 4724 + }, + { + "epoch": 3.773961661341853, + "grad_norm": 0.06285518407821655, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4725 + }, + { + "epoch": 3.7747603833865817, + "grad_norm": 0.13192926347255707, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4726 + }, + { + "epoch": 3.77555910543131, + "grad_norm": 0.12157132476568222, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4727 + }, + { + "epoch": 3.7763578274760383, + "grad_norm": 0.1203337088227272, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4728 + }, + { + "epoch": 3.777156549520767, + "grad_norm": 0.16711866855621338, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4729 + }, + { + "epoch": 3.777955271565495, + "grad_norm": 0.13596504926681519, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4730 + }, + { + "epoch": 3.7787539936102235, + "grad_norm": 0.13502761721611023, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4731 + }, + { + "epoch": 3.779552715654952, + "grad_norm": 0.0751141607761383, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 4732 + }, + { + "epoch": 3.7803514376996805, + "grad_norm": 0.1104620099067688, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4733 + }, + { + "epoch": 3.781150159744409, + "grad_norm": 0.06397949904203415, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 4734 + }, + { + "epoch": 3.7819488817891376, + "grad_norm": 0.07850230485200882, + "learning_rate": 0.0005, + "loss": 1.038, + "step": 4735 + }, + { + "epoch": 3.7827476038338657, + "grad_norm": 0.10330549627542496, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4736 + }, + { + "epoch": 3.783546325878594, + "grad_norm": 0.08978938311338425, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4737 + }, + { + "epoch": 3.7843450479233227, + "grad_norm": 0.07073058933019638, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4738 + }, + { + "epoch": 3.7851437699680512, + "grad_norm": 0.05997786670923233, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 4739 + }, + { + "epoch": 3.7859424920127793, + "grad_norm": 0.0779404565691948, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 4740 + }, + { + "epoch": 3.786741214057508, + "grad_norm": 0.1367640644311905, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 4741 + }, + { + "epoch": 3.7875399361022364, + "grad_norm": 0.08670534938573837, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4742 + }, + { + "epoch": 3.788338658146965, + "grad_norm": 0.08612547069787979, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4743 + }, + { + "epoch": 3.7891373801916934, + "grad_norm": 0.06312929093837738, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4744 + }, + { + "epoch": 3.789936102236422, + "grad_norm": 0.06397293508052826, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 4745 + }, + { + "epoch": 3.79073482428115, + "grad_norm": 0.0663115605711937, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4746 + }, + { + "epoch": 3.7915335463258786, + "grad_norm": 0.07580576092004776, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4747 + }, + { + "epoch": 3.792332268370607, + "grad_norm": 0.12604761123657227, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 4748 + }, + { + "epoch": 3.793130990415335, + "grad_norm": 0.08900050073862076, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4749 + }, + { + "epoch": 3.7939297124600637, + "grad_norm": 0.09280730038881302, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4750 + }, + { + "epoch": 3.7947284345047922, + "grad_norm": 0.17689163982868195, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 4751 + }, + { + "epoch": 3.7955271565495208, + "grad_norm": 0.06348183006048203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4752 + }, + { + "epoch": 3.7963258785942493, + "grad_norm": 0.12626387178897858, + "learning_rate": 0.0005, + "loss": 1.0387, + "step": 4753 + }, + { + "epoch": 3.797124600638978, + "grad_norm": 0.1138390377163887, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4754 + }, + { + "epoch": 3.797923322683706, + "grad_norm": 0.08058728277683258, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4755 + }, + { + "epoch": 3.7987220447284344, + "grad_norm": 0.09671882539987564, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 4756 + }, + { + "epoch": 3.799520766773163, + "grad_norm": 0.12193922698497772, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4757 + }, + { + "epoch": 3.8003194888178915, + "grad_norm": 0.31105268001556396, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4758 + }, + { + "epoch": 3.8011182108626196, + "grad_norm": 0.10482051223516464, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 4759 + }, + { + "epoch": 3.801916932907348, + "grad_norm": 0.09116382896900177, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4760 + }, + { + "epoch": 3.8027156549520766, + "grad_norm": 0.08212421089410782, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4761 + }, + { + "epoch": 3.803514376996805, + "grad_norm": 0.08267461508512497, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4762 + }, + { + "epoch": 3.8043130990415337, + "grad_norm": 0.13247907161712646, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4763 + }, + { + "epoch": 3.8051118210862622, + "grad_norm": 0.1083490327000618, + "learning_rate": 0.0005, + "loss": 1.0409, + "step": 4764 + }, + { + "epoch": 3.8059105431309903, + "grad_norm": 0.11947019398212433, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 4765 + }, + { + "epoch": 3.806709265175719, + "grad_norm": 0.08462221175432205, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 4766 + }, + { + "epoch": 3.8075079872204474, + "grad_norm": 0.07244928181171417, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 4767 + }, + { + "epoch": 3.8083067092651754, + "grad_norm": 0.13432611525058746, + "learning_rate": 0.0005, + "loss": 1.0442, + "step": 4768 + }, + { + "epoch": 3.809105431309904, + "grad_norm": 0.16640888154506683, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4769 + }, + { + "epoch": 3.8099041533546325, + "grad_norm": 0.12189232558012009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4770 + }, + { + "epoch": 3.810702875399361, + "grad_norm": 0.052367180585861206, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4771 + }, + { + "epoch": 3.8115015974440896, + "grad_norm": 0.10426424443721771, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4772 + }, + { + "epoch": 3.812300319488818, + "grad_norm": 0.11365417391061783, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4773 + }, + { + "epoch": 3.813099041533546, + "grad_norm": 0.07064168155193329, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4774 + }, + { + "epoch": 3.8138977635782747, + "grad_norm": 0.2107549011707306, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4775 + }, + { + "epoch": 3.8146964856230032, + "grad_norm": 0.2984449565410614, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4776 + }, + { + "epoch": 3.8154952076677318, + "grad_norm": 0.26252058148384094, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4777 + }, + { + "epoch": 3.81629392971246, + "grad_norm": 0.08128907531499863, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 4778 + }, + { + "epoch": 3.8170926517571884, + "grad_norm": 0.2724008858203888, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4779 + }, + { + "epoch": 3.817891373801917, + "grad_norm": 0.2646482288837433, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 4780 + }, + { + "epoch": 3.8186900958466454, + "grad_norm": 0.16063876450061798, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4781 + }, + { + "epoch": 3.819488817891374, + "grad_norm": 0.11671862006187439, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4782 + }, + { + "epoch": 3.8202875399361025, + "grad_norm": 0.21605245769023895, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4783 + }, + { + "epoch": 3.8210862619808306, + "grad_norm": 0.17344583570957184, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4784 + }, + { + "epoch": 3.821884984025559, + "grad_norm": 0.08113347738981247, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4785 + }, + { + "epoch": 3.8226837060702876, + "grad_norm": 0.11774581670761108, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 4786 + }, + { + "epoch": 3.8234824281150157, + "grad_norm": 0.2024560272693634, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4787 + }, + { + "epoch": 3.8242811501597442, + "grad_norm": 0.5578162670135498, + "learning_rate": 0.0005, + "loss": 1.0382, + "step": 4788 + }, + { + "epoch": 3.8250798722044728, + "grad_norm": 0.10354574024677277, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4789 + }, + { + "epoch": 3.8258785942492013, + "grad_norm": 0.14583979547023773, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4790 + }, + { + "epoch": 3.82667731629393, + "grad_norm": 0.15853755176067352, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4791 + }, + { + "epoch": 3.8274760383386583, + "grad_norm": 0.1308104395866394, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4792 + }, + { + "epoch": 3.8282747603833864, + "grad_norm": 0.04385368898510933, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4793 + }, + { + "epoch": 3.829073482428115, + "grad_norm": 0.16213825345039368, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4794 + }, + { + "epoch": 3.8298722044728435, + "grad_norm": 0.2693546414375305, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 4795 + }, + { + "epoch": 3.830670926517572, + "grad_norm": 0.23904170095920563, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4796 + }, + { + "epoch": 3.8314696485623, + "grad_norm": 0.11313450336456299, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4797 + }, + { + "epoch": 3.8322683706070286, + "grad_norm": 0.0770820751786232, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4798 + }, + { + "epoch": 3.833067092651757, + "grad_norm": 0.8537606596946716, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 4799 + }, + { + "epoch": 3.8338658146964857, + "grad_norm": 0.13684043288230896, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 4800 + }, + { + "epoch": 3.834664536741214, + "grad_norm": 0.0890694409608841, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 4801 + }, + { + "epoch": 3.8354632587859427, + "grad_norm": 0.060917336493730545, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 4802 + }, + { + "epoch": 3.836261980830671, + "grad_norm": 0.13864673674106598, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4803 + }, + { + "epoch": 3.8370607028753994, + "grad_norm": 0.15316139161586761, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4804 + }, + { + "epoch": 3.837859424920128, + "grad_norm": 0.061508018523454666, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4805 + }, + { + "epoch": 3.838658146964856, + "grad_norm": 0.126112699508667, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 4806 + }, + { + "epoch": 3.8394568690095845, + "grad_norm": 0.1663133054971695, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4807 + }, + { + "epoch": 3.840255591054313, + "grad_norm": 0.14435894787311554, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4808 + }, + { + "epoch": 3.8410543130990416, + "grad_norm": 0.06042332574725151, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4809 + }, + { + "epoch": 3.84185303514377, + "grad_norm": 0.12759631872177124, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 4810 + }, + { + "epoch": 3.8426517571884986, + "grad_norm": 0.18153302371501923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4811 + }, + { + "epoch": 3.8434504792332267, + "grad_norm": 0.1280708760023117, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 4812 + }, + { + "epoch": 3.844249201277955, + "grad_norm": 0.07144157588481903, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4813 + }, + { + "epoch": 3.8450479233226837, + "grad_norm": 0.13078796863555908, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 4814 + }, + { + "epoch": 3.8458466453674123, + "grad_norm": 0.16230762004852295, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 4815 + }, + { + "epoch": 3.8466453674121404, + "grad_norm": 0.10997766256332397, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 4816 + }, + { + "epoch": 3.847444089456869, + "grad_norm": 0.06006971001625061, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4817 + }, + { + "epoch": 3.8482428115015974, + "grad_norm": 0.10155797749757767, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 4818 + }, + { + "epoch": 3.849041533546326, + "grad_norm": 0.11125919967889786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 4819 + }, + { + "epoch": 3.8498402555910545, + "grad_norm": 0.0860416367650032, + "learning_rate": 0.0005, + "loss": 1.0415, + "step": 4820 + }, + { + "epoch": 3.850638977635783, + "grad_norm": 0.0862870067358017, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 4821 + }, + { + "epoch": 3.851437699680511, + "grad_norm": 0.07229744642972946, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4822 + }, + { + "epoch": 3.8522364217252396, + "grad_norm": 0.10448424518108368, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 4823 + }, + { + "epoch": 3.853035143769968, + "grad_norm": 0.08971705287694931, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 4824 + }, + { + "epoch": 3.8538338658146962, + "grad_norm": 0.09876695275306702, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4825 + }, + { + "epoch": 3.8546325878594248, + "grad_norm": 0.0667971819639206, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 4826 + }, + { + "epoch": 3.8554313099041533, + "grad_norm": 0.14437620341777802, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 4827 + }, + { + "epoch": 3.856230031948882, + "grad_norm": 0.17627735435962677, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4828 + }, + { + "epoch": 3.8570287539936103, + "grad_norm": 0.10524439066648483, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4829 + }, + { + "epoch": 3.857827476038339, + "grad_norm": 0.15091893076896667, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4830 + }, + { + "epoch": 3.858626198083067, + "grad_norm": 0.22534102201461792, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4831 + }, + { + "epoch": 3.8594249201277955, + "grad_norm": 0.08298768103122711, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4832 + }, + { + "epoch": 3.860223642172524, + "grad_norm": 0.16647395491600037, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 4833 + }, + { + "epoch": 3.8610223642172525, + "grad_norm": 0.22512534260749817, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 4834 + }, + { + "epoch": 3.8618210862619806, + "grad_norm": 0.2130710482597351, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4835 + }, + { + "epoch": 3.862619808306709, + "grad_norm": 0.1250864863395691, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4836 + }, + { + "epoch": 3.8634185303514377, + "grad_norm": 0.13937048614025116, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4837 + }, + { + "epoch": 3.864217252396166, + "grad_norm": 0.19059741497039795, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4838 + }, + { + "epoch": 3.8650159744408947, + "grad_norm": 0.22080829739570618, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4839 + }, + { + "epoch": 3.8658146964856233, + "grad_norm": 0.09463749825954437, + "learning_rate": 0.0005, + "loss": 1.0418, + "step": 4840 + }, + { + "epoch": 3.8666134185303513, + "grad_norm": 0.16431698203086853, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4841 + }, + { + "epoch": 3.86741214057508, + "grad_norm": 0.2162260264158249, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 4842 + }, + { + "epoch": 3.8682108626198084, + "grad_norm": 0.0789603665471077, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4843 + }, + { + "epoch": 3.8690095846645365, + "grad_norm": 0.18372099101543427, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 4844 + }, + { + "epoch": 3.869808306709265, + "grad_norm": 0.24845194816589355, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 4845 + }, + { + "epoch": 3.8706070287539935, + "grad_norm": 0.22064632177352905, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4846 + }, + { + "epoch": 3.871405750798722, + "grad_norm": 0.0718264952301979, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4847 + }, + { + "epoch": 3.8722044728434506, + "grad_norm": 0.2048031985759735, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4848 + }, + { + "epoch": 3.873003194888179, + "grad_norm": 0.23190200328826904, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 4849 + }, + { + "epoch": 3.873801916932907, + "grad_norm": 0.06851150840520859, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 4850 + }, + { + "epoch": 3.8746006389776357, + "grad_norm": 0.2371164858341217, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4851 + }, + { + "epoch": 3.8753993610223643, + "grad_norm": 0.23518243432044983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4852 + }, + { + "epoch": 3.876198083067093, + "grad_norm": 0.08026961237192154, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4853 + }, + { + "epoch": 3.876996805111821, + "grad_norm": 0.1623634397983551, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4854 + }, + { + "epoch": 3.8777955271565494, + "grad_norm": 0.21676453948020935, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4855 + }, + { + "epoch": 3.878594249201278, + "grad_norm": 0.07868681848049164, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 4856 + }, + { + "epoch": 3.8793929712460065, + "grad_norm": 0.18302997946739197, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4857 + }, + { + "epoch": 3.880191693290735, + "grad_norm": 0.2338407188653946, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 4858 + }, + { + "epoch": 3.8809904153354635, + "grad_norm": 0.2534898817539215, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4859 + }, + { + "epoch": 3.8817891373801916, + "grad_norm": 0.19988521933555603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4860 + }, + { + "epoch": 3.88258785942492, + "grad_norm": 0.2896076440811157, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4861 + }, + { + "epoch": 3.8833865814696487, + "grad_norm": 0.1088651567697525, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4862 + }, + { + "epoch": 3.8841853035143767, + "grad_norm": 0.18549342453479767, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4863 + }, + { + "epoch": 3.8849840255591053, + "grad_norm": 0.24760019779205322, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 4864 + }, + { + "epoch": 3.885782747603834, + "grad_norm": 0.1323750913143158, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 4865 + }, + { + "epoch": 3.8865814696485623, + "grad_norm": 0.14235283434391022, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 4866 + }, + { + "epoch": 3.887380191693291, + "grad_norm": 0.20409083366394043, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 4867 + }, + { + "epoch": 3.8881789137380194, + "grad_norm": 0.1743297129869461, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4868 + }, + { + "epoch": 3.8889776357827475, + "grad_norm": 0.09692966938018799, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 4869 + }, + { + "epoch": 3.889776357827476, + "grad_norm": 0.09934467077255249, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4870 + }, + { + "epoch": 3.8905750798722045, + "grad_norm": 0.2410827875137329, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4871 + }, + { + "epoch": 3.891373801916933, + "grad_norm": 0.27096229791641235, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4872 + }, + { + "epoch": 3.892172523961661, + "grad_norm": 0.09133906662464142, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4873 + }, + { + "epoch": 3.8929712460063897, + "grad_norm": 0.20275604724884033, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 4874 + }, + { + "epoch": 3.893769968051118, + "grad_norm": 0.19578030705451965, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4875 + }, + { + "epoch": 3.8945686900958467, + "grad_norm": 0.12888970971107483, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 4876 + }, + { + "epoch": 3.8953674121405752, + "grad_norm": 0.10301528871059418, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4877 + }, + { + "epoch": 3.8961661341853038, + "grad_norm": 0.1635914444923401, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 4878 + }, + { + "epoch": 3.896964856230032, + "grad_norm": 0.1971803456544876, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 4879 + }, + { + "epoch": 3.8977635782747604, + "grad_norm": 0.1085273027420044, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 4880 + }, + { + "epoch": 3.898562300319489, + "grad_norm": 0.07375707477331161, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 4881 + }, + { + "epoch": 3.899361022364217, + "grad_norm": 0.5828747153282166, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 4882 + }, + { + "epoch": 3.9001597444089455, + "grad_norm": 0.10320120304822922, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4883 + }, + { + "epoch": 3.900958466453674, + "grad_norm": 0.10118676722049713, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4884 + }, + { + "epoch": 3.9017571884984026, + "grad_norm": 0.22034543752670288, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4885 + }, + { + "epoch": 3.902555910543131, + "grad_norm": 0.21823646128177643, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4886 + }, + { + "epoch": 3.9033546325878596, + "grad_norm": 0.14776065945625305, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4887 + }, + { + "epoch": 3.9041533546325877, + "grad_norm": 0.13297663629055023, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 4888 + }, + { + "epoch": 3.9049520766773163, + "grad_norm": 0.4447253942489624, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4889 + }, + { + "epoch": 3.905750798722045, + "grad_norm": 0.171112522482872, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 4890 + }, + { + "epoch": 3.9065495207667733, + "grad_norm": 0.1581616848707199, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 4891 + }, + { + "epoch": 3.9073482428115014, + "grad_norm": 0.18396562337875366, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4892 + }, + { + "epoch": 3.90814696485623, + "grad_norm": 0.15952393412590027, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 4893 + }, + { + "epoch": 3.9089456869009584, + "grad_norm": 0.12889564037322998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 4894 + }, + { + "epoch": 3.909744408945687, + "grad_norm": 0.130104660987854, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 4895 + }, + { + "epoch": 3.9105431309904155, + "grad_norm": 0.13011464476585388, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4896 + }, + { + "epoch": 3.911341853035144, + "grad_norm": 0.06485363095998764, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 4897 + }, + { + "epoch": 3.912140575079872, + "grad_norm": 0.11353932321071625, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4898 + }, + { + "epoch": 3.9129392971246006, + "grad_norm": 0.13279879093170166, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4899 + }, + { + "epoch": 3.913738019169329, + "grad_norm": 0.19181469082832336, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 4900 + }, + { + "epoch": 3.9145367412140573, + "grad_norm": 0.06930892914533615, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 4901 + }, + { + "epoch": 3.915335463258786, + "grad_norm": 0.10591714829206467, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 4902 + }, + { + "epoch": 3.9161341853035143, + "grad_norm": 0.09693296998739243, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 4903 + }, + { + "epoch": 3.916932907348243, + "grad_norm": 0.1604270488023758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 4904 + }, + { + "epoch": 3.9177316293929714, + "grad_norm": 0.19874586164951324, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4905 + }, + { + "epoch": 3.9185303514377, + "grad_norm": 0.09015987068414688, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 4906 + }, + { + "epoch": 3.919329073482428, + "grad_norm": 0.09864864498376846, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4907 + }, + { + "epoch": 3.9201277955271565, + "grad_norm": 0.12509673833847046, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4908 + }, + { + "epoch": 3.920926517571885, + "grad_norm": 0.10216362774372101, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 4909 + }, + { + "epoch": 3.9217252396166136, + "grad_norm": 0.11854741722345352, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4910 + }, + { + "epoch": 3.9225239616613417, + "grad_norm": 0.08570919930934906, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 4911 + }, + { + "epoch": 3.92332268370607, + "grad_norm": 0.095781609416008, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4912 + }, + { + "epoch": 3.9241214057507987, + "grad_norm": 0.05698491260409355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4913 + }, + { + "epoch": 3.9249201277955272, + "grad_norm": 0.09786297380924225, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 4914 + }, + { + "epoch": 3.9257188498402558, + "grad_norm": 0.1206512302160263, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4915 + }, + { + "epoch": 3.9265175718849843, + "grad_norm": 0.07593982666730881, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 4916 + }, + { + "epoch": 3.9273162939297124, + "grad_norm": 0.06973730027675629, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 4917 + }, + { + "epoch": 3.928115015974441, + "grad_norm": 0.07377546280622482, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 4918 + }, + { + "epoch": 3.9289137380191694, + "grad_norm": 0.06871537119150162, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4919 + }, + { + "epoch": 3.9297124600638975, + "grad_norm": 0.09697525203227997, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4920 + }, + { + "epoch": 3.930511182108626, + "grad_norm": 0.07418478280305862, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4921 + }, + { + "epoch": 3.9313099041533546, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 4922 + }, + { + "epoch": 3.932108626198083, + "grad_norm": 0.08099815994501114, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4923 + }, + { + "epoch": 3.9329073482428116, + "grad_norm": 0.08033913373947144, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4924 + }, + { + "epoch": 3.93370607028754, + "grad_norm": 0.1089775413274765, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4925 + }, + { + "epoch": 3.9345047923322682, + "grad_norm": 0.06866748631000519, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 4926 + }, + { + "epoch": 3.9353035143769968, + "grad_norm": 0.12346489727497101, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 4927 + }, + { + "epoch": 3.9361022364217253, + "grad_norm": 0.1388891190290451, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4928 + }, + { + "epoch": 3.936900958466454, + "grad_norm": 0.12678411602973938, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4929 + }, + { + "epoch": 3.937699680511182, + "grad_norm": 0.08638305962085724, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 4930 + }, + { + "epoch": 3.9384984025559104, + "grad_norm": 0.667020320892334, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 4931 + }, + { + "epoch": 3.939297124600639, + "grad_norm": 0.0867542177438736, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 4932 + }, + { + "epoch": 3.9400958466453675, + "grad_norm": 0.1075657457113266, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4933 + }, + { + "epoch": 3.940894568690096, + "grad_norm": 0.10359356552362442, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 4934 + }, + { + "epoch": 3.9416932907348246, + "grad_norm": 0.04861772805452347, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 4935 + }, + { + "epoch": 3.9424920127795526, + "grad_norm": 0.08871651440858841, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 4936 + }, + { + "epoch": 3.943290734824281, + "grad_norm": 0.05268944799900055, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 4937 + }, + { + "epoch": 3.9440894568690097, + "grad_norm": 0.11428069323301315, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4938 + }, + { + "epoch": 3.9448881789137378, + "grad_norm": 0.1302616149187088, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 4939 + }, + { + "epoch": 3.9456869009584663, + "grad_norm": 0.09091098606586456, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4940 + }, + { + "epoch": 3.946485623003195, + "grad_norm": 0.23224923014640808, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 4941 + }, + { + "epoch": 3.9472843450479234, + "grad_norm": 0.13427230715751648, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 4942 + }, + { + "epoch": 3.948083067092652, + "grad_norm": 0.24157744646072388, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 4943 + }, + { + "epoch": 3.9488817891373804, + "grad_norm": 0.15497569739818573, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 4944 + }, + { + "epoch": 3.9496805111821085, + "grad_norm": 0.15587151050567627, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 4945 + }, + { + "epoch": 3.950479233226837, + "grad_norm": 0.0827038437128067, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 4946 + }, + { + "epoch": 3.9512779552715656, + "grad_norm": 0.17405007779598236, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4947 + }, + { + "epoch": 3.952076677316294, + "grad_norm": 0.1612532138824463, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 4948 + }, + { + "epoch": 3.952875399361022, + "grad_norm": 0.07505665719509125, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4949 + }, + { + "epoch": 3.9536741214057507, + "grad_norm": 0.07138567417860031, + "learning_rate": 0.0005, + "loss": 1.0433, + "step": 4950 + }, + { + "epoch": 3.9544728434504792, + "grad_norm": 0.09206511080265045, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 4951 + }, + { + "epoch": 3.9552715654952078, + "grad_norm": 0.09190725535154343, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4952 + }, + { + "epoch": 3.9560702875399363, + "grad_norm": 0.13024544715881348, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 4953 + }, + { + "epoch": 3.956869009584665, + "grad_norm": 0.08161026239395142, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 4954 + }, + { + "epoch": 3.957667731629393, + "grad_norm": 0.17207187414169312, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4955 + }, + { + "epoch": 3.9584664536741214, + "grad_norm": 0.096051886677742, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 4956 + }, + { + "epoch": 3.95926517571885, + "grad_norm": 0.11038299649953842, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4957 + }, + { + "epoch": 3.960063897763578, + "grad_norm": 0.09957583248615265, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 4958 + }, + { + "epoch": 3.9608626198083066, + "grad_norm": 0.06923667341470718, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 4959 + }, + { + "epoch": 3.961661341853035, + "grad_norm": 0.07572069019079208, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 4960 + }, + { + "epoch": 3.9624600638977636, + "grad_norm": 0.16801652312278748, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 4961 + }, + { + "epoch": 3.963258785942492, + "grad_norm": 0.062117498368024826, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 4962 + }, + { + "epoch": 3.9640575079872207, + "grad_norm": 0.08293396979570389, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 4963 + }, + { + "epoch": 3.9648562300319488, + "grad_norm": 0.2021675407886505, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 4964 + }, + { + "epoch": 3.9656549520766773, + "grad_norm": 0.10666973143815994, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 4965 + }, + { + "epoch": 3.966453674121406, + "grad_norm": 0.09226572513580322, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4966 + }, + { + "epoch": 3.9672523961661343, + "grad_norm": 0.10113741457462311, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4967 + }, + { + "epoch": 3.9680511182108624, + "grad_norm": 0.10156626254320145, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 4968 + }, + { + "epoch": 3.968849840255591, + "grad_norm": 0.08531442284584045, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 4969 + }, + { + "epoch": 3.9696485623003195, + "grad_norm": 0.08894761651754379, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 4970 + }, + { + "epoch": 3.970447284345048, + "grad_norm": 0.07934322953224182, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 4971 + }, + { + "epoch": 3.9712460063897765, + "grad_norm": 0.07121701538562775, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 4972 + }, + { + "epoch": 3.972044728434505, + "grad_norm": 0.09110251814126968, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 4973 + }, + { + "epoch": 3.972843450479233, + "grad_norm": 0.09724952280521393, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4974 + }, + { + "epoch": 3.9736421725239617, + "grad_norm": 0.08619683235883713, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 4975 + }, + { + "epoch": 3.97444089456869, + "grad_norm": 0.14789989590644836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 4976 + }, + { + "epoch": 3.9752396166134183, + "grad_norm": 0.08736634254455566, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 4977 + }, + { + "epoch": 3.976038338658147, + "grad_norm": 0.2260635793209076, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 4978 + }, + { + "epoch": 3.9768370607028753, + "grad_norm": 0.2150910496711731, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 4979 + }, + { + "epoch": 3.977635782747604, + "grad_norm": 0.12071242183446884, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 4980 + }, + { + "epoch": 3.9784345047923324, + "grad_norm": 0.11614276468753815, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 4981 + }, + { + "epoch": 3.979233226837061, + "grad_norm": 0.0954839214682579, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 4982 + }, + { + "epoch": 3.980031948881789, + "grad_norm": 0.09801400452852249, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4983 + }, + { + "epoch": 3.9808306709265175, + "grad_norm": 0.07435343414545059, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 4984 + }, + { + "epoch": 3.981629392971246, + "grad_norm": 0.09401766955852509, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 4985 + }, + { + "epoch": 3.9824281150159746, + "grad_norm": 0.09850753843784332, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 4986 + }, + { + "epoch": 3.9832268370607027, + "grad_norm": 0.07880235463380814, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 4987 + }, + { + "epoch": 3.984025559105431, + "grad_norm": 0.08208848536014557, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 4988 + }, + { + "epoch": 3.9848242811501597, + "grad_norm": 0.10432668030261993, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4989 + }, + { + "epoch": 3.9856230031948883, + "grad_norm": 0.05202944204211235, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 4990 + }, + { + "epoch": 3.986421725239617, + "grad_norm": 0.0831860601902008, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4991 + }, + { + "epoch": 3.987220447284345, + "grad_norm": 0.1084689050912857, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 4992 + }, + { + "epoch": 3.9880191693290734, + "grad_norm": 0.1095893383026123, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 4993 + }, + { + "epoch": 3.988817891373802, + "grad_norm": 0.24480414390563965, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 4994 + }, + { + "epoch": 3.9896166134185305, + "grad_norm": 0.11939835548400879, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 4995 + }, + { + "epoch": 3.9904153354632586, + "grad_norm": 0.0829034298658371, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 4996 + }, + { + "epoch": 3.991214057507987, + "grad_norm": 0.1649356484413147, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 4997 + }, + { + "epoch": 3.9920127795527156, + "grad_norm": 0.18428824841976166, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 4998 + }, + { + "epoch": 3.992811501597444, + "grad_norm": 0.14441022276878357, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 4999 + }, + { + "epoch": 3.9936102236421727, + "grad_norm": 0.1025838553905487, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5000 + }, + { + "epoch": 3.994408945686901, + "grad_norm": 0.18659353256225586, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5001 + }, + { + "epoch": 3.9952076677316293, + "grad_norm": 0.18462489545345306, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 5002 + }, + { + "epoch": 3.996006389776358, + "grad_norm": 0.11221570521593094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5003 + }, + { + "epoch": 3.9968051118210863, + "grad_norm": 0.1611207127571106, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5004 + }, + { + "epoch": 3.997603833865815, + "grad_norm": 0.10003258287906647, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5005 + }, + { + "epoch": 3.998402555910543, + "grad_norm": 0.06686410307884216, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5006 + }, + { + "epoch": 3.9992012779552715, + "grad_norm": 0.07527180016040802, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5007 + }, + { + "epoch": 4.0, + "grad_norm": 0.11602520197629929, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5008 + }, + { + "epoch": 4.0007987220447285, + "grad_norm": 0.04460546746850014, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5009 + }, + { + "epoch": 4.001597444089457, + "grad_norm": 1.1286108493804932, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5010 + }, + { + "epoch": 4.002396166134186, + "grad_norm": 0.12730571627616882, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5011 + }, + { + "epoch": 4.003194888178914, + "grad_norm": 0.060798924416303635, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5012 + }, + { + "epoch": 4.003993610223642, + "grad_norm": 0.11491188406944275, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5013 + }, + { + "epoch": 4.00479233226837, + "grad_norm": 0.09877663850784302, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5014 + }, + { + "epoch": 4.005591054313099, + "grad_norm": 0.06991511583328247, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5015 + }, + { + "epoch": 4.006389776357827, + "grad_norm": 0.05524459481239319, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 5016 + }, + { + "epoch": 4.007188498402556, + "grad_norm": 0.07421471178531647, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5017 + }, + { + "epoch": 4.007987220447284, + "grad_norm": 0.10918284207582474, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5018 + }, + { + "epoch": 4.008785942492013, + "grad_norm": 0.42926761507987976, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5019 + }, + { + "epoch": 4.0095846645367414, + "grad_norm": 0.12511351704597473, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5020 + }, + { + "epoch": 4.01038338658147, + "grad_norm": 0.0985826924443245, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5021 + }, + { + "epoch": 4.0111821086261985, + "grad_norm": 0.10876046866178513, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5022 + }, + { + "epoch": 4.011980830670926, + "grad_norm": 0.0973401740193367, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 5023 + }, + { + "epoch": 4.012779552715655, + "grad_norm": 0.10867046564817429, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5024 + }, + { + "epoch": 4.013578274760383, + "grad_norm": 0.16030259430408478, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5025 + }, + { + "epoch": 4.014376996805112, + "grad_norm": 0.09972470998764038, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5026 + }, + { + "epoch": 4.01517571884984, + "grad_norm": 0.06945701688528061, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5027 + }, + { + "epoch": 4.015974440894569, + "grad_norm": 0.12256570160388947, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5028 + }, + { + "epoch": 4.016773162939297, + "grad_norm": 0.1318589597940445, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5029 + }, + { + "epoch": 4.017571884984026, + "grad_norm": 0.14831772446632385, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5030 + }, + { + "epoch": 4.018370607028754, + "grad_norm": 0.12650129199028015, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5031 + }, + { + "epoch": 4.019169329073482, + "grad_norm": 0.25457820296287537, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5032 + }, + { + "epoch": 4.0199680511182105, + "grad_norm": 0.10183271020650864, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5033 + }, + { + "epoch": 4.020766773162939, + "grad_norm": 0.14198726415634155, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5034 + }, + { + "epoch": 4.021565495207668, + "grad_norm": 0.1551627218723297, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5035 + }, + { + "epoch": 4.022364217252396, + "grad_norm": 0.29212328791618347, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5036 + }, + { + "epoch": 4.023162939297125, + "grad_norm": 0.25203290581703186, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5037 + }, + { + "epoch": 4.023961661341853, + "grad_norm": 0.12793950736522675, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5038 + }, + { + "epoch": 4.024760383386582, + "grad_norm": 0.10916420817375183, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5039 + }, + { + "epoch": 4.02555910543131, + "grad_norm": 0.09980735182762146, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5040 + }, + { + "epoch": 4.026357827476039, + "grad_norm": 0.1633901745080948, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5041 + }, + { + "epoch": 4.027156549520766, + "grad_norm": 0.10058299452066422, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5042 + }, + { + "epoch": 4.027955271565495, + "grad_norm": 0.08121561259031296, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5043 + }, + { + "epoch": 4.0287539936102235, + "grad_norm": 0.19947005808353424, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5044 + }, + { + "epoch": 4.029552715654952, + "grad_norm": 0.24219068884849548, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5045 + }, + { + "epoch": 4.0303514376996805, + "grad_norm": 0.28928735852241516, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5046 + }, + { + "epoch": 4.031150159744409, + "grad_norm": 0.062404267489910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5047 + }, + { + "epoch": 4.031948881789138, + "grad_norm": 0.1607569456100464, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5048 + }, + { + "epoch": 4.032747603833866, + "grad_norm": 0.14420244097709656, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5049 + }, + { + "epoch": 4.033546325878595, + "grad_norm": 0.838013768196106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5050 + }, + { + "epoch": 4.034345047923322, + "grad_norm": 0.15198078751564026, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5051 + }, + { + "epoch": 4.035143769968051, + "grad_norm": 0.18439999222755432, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5052 + }, + { + "epoch": 4.035942492012779, + "grad_norm": 0.1283460259437561, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5053 + }, + { + "epoch": 4.036741214057508, + "grad_norm": 0.07285412400960922, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5054 + }, + { + "epoch": 4.037539936102236, + "grad_norm": 0.21856451034545898, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5055 + }, + { + "epoch": 4.038338658146965, + "grad_norm": 0.1934041529893875, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5056 + }, + { + "epoch": 4.039137380191693, + "grad_norm": 0.07998216152191162, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5057 + }, + { + "epoch": 4.039936102236422, + "grad_norm": 0.2202988713979721, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5058 + }, + { + "epoch": 4.0407348242811505, + "grad_norm": 0.22000271081924438, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5059 + }, + { + "epoch": 4.041533546325879, + "grad_norm": 0.06229308247566223, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5060 + }, + { + "epoch": 4.042332268370607, + "grad_norm": 0.19611188769340515, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5061 + }, + { + "epoch": 4.043130990415335, + "grad_norm": 0.2385999858379364, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5062 + }, + { + "epoch": 4.043929712460064, + "grad_norm": 0.06504995375871658, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5063 + }, + { + "epoch": 4.044728434504792, + "grad_norm": 0.17860567569732666, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5064 + }, + { + "epoch": 4.045527156549521, + "grad_norm": 0.17580853402614594, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5065 + }, + { + "epoch": 4.046325878594249, + "grad_norm": 0.06523217260837555, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5066 + }, + { + "epoch": 4.047124600638978, + "grad_norm": 0.2795565128326416, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5067 + }, + { + "epoch": 4.047923322683706, + "grad_norm": 0.289105623960495, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5068 + }, + { + "epoch": 4.048722044728435, + "grad_norm": 0.07829197496175766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5069 + }, + { + "epoch": 4.0495207667731625, + "grad_norm": 0.24165435135364532, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5070 + }, + { + "epoch": 4.050319488817891, + "grad_norm": 0.2785094976425171, + "learning_rate": 0.0005, + "loss": 1.0461, + "step": 5071 + }, + { + "epoch": 4.05111821086262, + "grad_norm": 0.08929550647735596, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5072 + }, + { + "epoch": 4.051916932907348, + "grad_norm": 0.24677781760692596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5073 + }, + { + "epoch": 4.052715654952077, + "grad_norm": 0.25207674503326416, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5074 + }, + { + "epoch": 4.053514376996805, + "grad_norm": 0.06409729272127151, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5075 + }, + { + "epoch": 4.054313099041534, + "grad_norm": 0.2670205235481262, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5076 + }, + { + "epoch": 4.055111821086262, + "grad_norm": 0.1854943484067917, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5077 + }, + { + "epoch": 4.055910543130991, + "grad_norm": 0.1409354954957962, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5078 + }, + { + "epoch": 4.056709265175719, + "grad_norm": 0.24084609746932983, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5079 + }, + { + "epoch": 4.057507987220447, + "grad_norm": 0.16520382463932037, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5080 + }, + { + "epoch": 4.0583067092651754, + "grad_norm": 0.11086967587471008, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5081 + }, + { + "epoch": 4.059105431309904, + "grad_norm": 0.15748612582683563, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5082 + }, + { + "epoch": 4.0599041533546325, + "grad_norm": 0.1196034848690033, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5083 + }, + { + "epoch": 4.060702875399361, + "grad_norm": 0.06799823045730591, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5084 + }, + { + "epoch": 4.06150159744409, + "grad_norm": 0.1223025768995285, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5085 + }, + { + "epoch": 4.062300319488818, + "grad_norm": 0.04760991781949997, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5086 + }, + { + "epoch": 4.063099041533547, + "grad_norm": 0.11782078444957733, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5087 + }, + { + "epoch": 4.063897763578275, + "grad_norm": 0.13057227432727814, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5088 + }, + { + "epoch": 4.064696485623003, + "grad_norm": 0.0719611644744873, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5089 + }, + { + "epoch": 4.065495207667731, + "grad_norm": 0.13513247668743134, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5090 + }, + { + "epoch": 4.06629392971246, + "grad_norm": 0.14960692822933197, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5091 + }, + { + "epoch": 4.067092651757188, + "grad_norm": 0.06219497323036194, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5092 + }, + { + "epoch": 4.067891373801917, + "grad_norm": 0.06755383312702179, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5093 + }, + { + "epoch": 4.068690095846645, + "grad_norm": 0.08237830549478531, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5094 + }, + { + "epoch": 4.069488817891374, + "grad_norm": 0.0915946289896965, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5095 + }, + { + "epoch": 4.0702875399361025, + "grad_norm": 0.06893479824066162, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5096 + }, + { + "epoch": 4.071086261980831, + "grad_norm": 0.04133071005344391, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5097 + }, + { + "epoch": 4.0718849840255595, + "grad_norm": 0.062333185225725174, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5098 + }, + { + "epoch": 4.072683706070287, + "grad_norm": 0.05741016939282417, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5099 + }, + { + "epoch": 4.073482428115016, + "grad_norm": 0.04988866671919823, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5100 + }, + { + "epoch": 4.074281150159744, + "grad_norm": 0.050187818706035614, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5101 + }, + { + "epoch": 4.075079872204473, + "grad_norm": 0.08479643613100052, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5102 + }, + { + "epoch": 4.075878594249201, + "grad_norm": 0.13840351998806, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5103 + }, + { + "epoch": 4.07667731629393, + "grad_norm": 0.11400903016328812, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5104 + }, + { + "epoch": 4.077476038338658, + "grad_norm": 0.06956811994314194, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5105 + }, + { + "epoch": 4.078274760383387, + "grad_norm": 0.09173833578824997, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5106 + }, + { + "epoch": 4.079073482428115, + "grad_norm": 0.09024006128311157, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5107 + }, + { + "epoch": 4.079872204472843, + "grad_norm": 0.04257406294345856, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5108 + }, + { + "epoch": 4.080670926517572, + "grad_norm": 0.04252707585692406, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5109 + }, + { + "epoch": 4.0814696485623, + "grad_norm": 0.052367035299539566, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5110 + }, + { + "epoch": 4.082268370607029, + "grad_norm": 0.06344939023256302, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5111 + }, + { + "epoch": 4.083067092651757, + "grad_norm": 0.04674215242266655, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5112 + }, + { + "epoch": 4.083865814696486, + "grad_norm": 0.03664534166455269, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 5113 + }, + { + "epoch": 4.084664536741214, + "grad_norm": 0.07198764383792877, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5114 + }, + { + "epoch": 4.085463258785943, + "grad_norm": 0.06294529885053635, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5115 + }, + { + "epoch": 4.086261980830671, + "grad_norm": 0.09595668315887451, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 5116 + }, + { + "epoch": 4.0870607028754, + "grad_norm": 0.09830893576145172, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5117 + }, + { + "epoch": 4.087859424920127, + "grad_norm": 0.09647611528635025, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5118 + }, + { + "epoch": 4.088658146964856, + "grad_norm": 0.04558149725198746, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5119 + }, + { + "epoch": 4.0894568690095845, + "grad_norm": 0.11090628057718277, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 5120 + }, + { + "epoch": 4.090255591054313, + "grad_norm": 0.1119648665189743, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5121 + }, + { + "epoch": 4.0910543130990416, + "grad_norm": 0.0372939296066761, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 5122 + }, + { + "epoch": 4.09185303514377, + "grad_norm": 0.10749047994613647, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5123 + }, + { + "epoch": 4.092651757188499, + "grad_norm": 0.08718341588973999, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5124 + }, + { + "epoch": 4.093450479233227, + "grad_norm": 0.04954478517174721, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5125 + }, + { + "epoch": 4.094249201277956, + "grad_norm": 0.0599503293633461, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5126 + }, + { + "epoch": 4.095047923322683, + "grad_norm": 0.04633599892258644, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5127 + }, + { + "epoch": 4.095846645367412, + "grad_norm": 0.0502074733376503, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5128 + }, + { + "epoch": 4.09664536741214, + "grad_norm": 0.1348472684621811, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5129 + }, + { + "epoch": 4.097444089456869, + "grad_norm": 0.07534858584403992, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5130 + }, + { + "epoch": 4.098242811501597, + "grad_norm": 0.04207107052206993, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 5131 + }, + { + "epoch": 4.099041533546326, + "grad_norm": 0.062090687453746796, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5132 + }, + { + "epoch": 4.0998402555910545, + "grad_norm": 0.08783479779958725, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 5133 + }, + { + "epoch": 4.100638977635783, + "grad_norm": 0.04489055275917053, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5134 + }, + { + "epoch": 4.1014376996805115, + "grad_norm": 0.07360105961561203, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 5135 + }, + { + "epoch": 4.102236421725239, + "grad_norm": 0.10253020375967026, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 5136 + }, + { + "epoch": 4.103035143769968, + "grad_norm": 0.12787389755249023, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5137 + }, + { + "epoch": 4.103833865814696, + "grad_norm": 0.43946513533592224, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5138 + }, + { + "epoch": 4.104632587859425, + "grad_norm": 0.7717093825340271, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5139 + }, + { + "epoch": 4.105431309904153, + "grad_norm": 0.1433849334716797, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5140 + }, + { + "epoch": 4.106230031948882, + "grad_norm": 0.09110052138566971, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5141 + }, + { + "epoch": 4.10702875399361, + "grad_norm": 0.13785111904144287, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5142 + }, + { + "epoch": 4.107827476038339, + "grad_norm": 0.0910695344209671, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5143 + }, + { + "epoch": 4.108626198083067, + "grad_norm": 0.10390721261501312, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5144 + }, + { + "epoch": 4.109424920127796, + "grad_norm": 0.07039178162813187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5145 + }, + { + "epoch": 4.110223642172524, + "grad_norm": 0.08536665886640549, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5146 + }, + { + "epoch": 4.111022364217252, + "grad_norm": 0.1355360597372055, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5147 + }, + { + "epoch": 4.111821086261981, + "grad_norm": 0.13981834053993225, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5148 + }, + { + "epoch": 4.112619808306709, + "grad_norm": 0.12653453648090363, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5149 + }, + { + "epoch": 4.113418530351438, + "grad_norm": 0.06805716454982758, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5150 + }, + { + "epoch": 4.114217252396166, + "grad_norm": 0.14361023902893066, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5151 + }, + { + "epoch": 4.115015974440895, + "grad_norm": 0.15223950147628784, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5152 + }, + { + "epoch": 4.115814696485623, + "grad_norm": 0.10013193637132645, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5153 + }, + { + "epoch": 4.116613418530352, + "grad_norm": 0.21049730479717255, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5154 + }, + { + "epoch": 4.11741214057508, + "grad_norm": 0.1393776834011078, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5155 + }, + { + "epoch": 4.118210862619808, + "grad_norm": 0.08584857732057571, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5156 + }, + { + "epoch": 4.1190095846645365, + "grad_norm": 0.06729432195425034, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5157 + }, + { + "epoch": 4.119808306709265, + "grad_norm": 0.08861853927373886, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5158 + }, + { + "epoch": 4.1206070287539935, + "grad_norm": 0.07037574052810669, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5159 + }, + { + "epoch": 4.121405750798722, + "grad_norm": 0.08049193024635315, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5160 + }, + { + "epoch": 4.122204472843451, + "grad_norm": 0.09040962159633636, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5161 + }, + { + "epoch": 4.123003194888179, + "grad_norm": 0.06531825661659241, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5162 + }, + { + "epoch": 4.123801916932908, + "grad_norm": 0.09423618763685226, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5163 + }, + { + "epoch": 4.124600638977636, + "grad_norm": 0.09436366707086563, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5164 + }, + { + "epoch": 4.125399361022364, + "grad_norm": 0.07543698698282242, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5165 + }, + { + "epoch": 4.126198083067092, + "grad_norm": 0.07491134852170944, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 5166 + }, + { + "epoch": 4.126996805111821, + "grad_norm": 0.09040437638759613, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5167 + }, + { + "epoch": 4.127795527156549, + "grad_norm": 0.11145798116922379, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5168 + }, + { + "epoch": 4.128594249201278, + "grad_norm": 0.35186707973480225, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5169 + }, + { + "epoch": 4.1293929712460065, + "grad_norm": 0.08744635432958603, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5170 + }, + { + "epoch": 4.130191693290735, + "grad_norm": 0.1078719049692154, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5171 + }, + { + "epoch": 4.1309904153354635, + "grad_norm": 0.13568760454654694, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5172 + }, + { + "epoch": 4.131789137380192, + "grad_norm": 0.10629335045814514, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5173 + }, + { + "epoch": 4.13258785942492, + "grad_norm": 0.3467697203159332, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5174 + }, + { + "epoch": 4.133386581469648, + "grad_norm": 0.5514479875564575, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5175 + }, + { + "epoch": 4.134185303514377, + "grad_norm": 0.2762874960899353, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5176 + }, + { + "epoch": 4.134984025559105, + "grad_norm": 0.25959524512290955, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5177 + }, + { + "epoch": 4.135782747603834, + "grad_norm": 0.26429036259651184, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5178 + }, + { + "epoch": 4.136581469648562, + "grad_norm": 0.4492235779762268, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5179 + }, + { + "epoch": 4.137380191693291, + "grad_norm": 0.3261977732181549, + "learning_rate": 0.0005, + "loss": 1.079, + "step": 5180 + }, + { + "epoch": 4.138178913738019, + "grad_norm": 0.15618108212947845, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5181 + }, + { + "epoch": 4.138977635782748, + "grad_norm": 0.2897289991378784, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5182 + }, + { + "epoch": 4.139776357827476, + "grad_norm": 0.2599884271621704, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5183 + }, + { + "epoch": 4.140575079872204, + "grad_norm": 0.3158198893070221, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5184 + }, + { + "epoch": 4.141373801916933, + "grad_norm": 0.2701073884963989, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5185 + }, + { + "epoch": 4.142172523961661, + "grad_norm": 0.14668017625808716, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5186 + }, + { + "epoch": 4.14297124600639, + "grad_norm": 0.14284202456474304, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5187 + }, + { + "epoch": 4.143769968051118, + "grad_norm": 0.1901128888130188, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5188 + }, + { + "epoch": 4.144568690095847, + "grad_norm": 0.17808575928211212, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5189 + }, + { + "epoch": 4.145367412140575, + "grad_norm": 0.11329478025436401, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5190 + }, + { + "epoch": 4.146166134185304, + "grad_norm": 0.10816467553377151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5191 + }, + { + "epoch": 4.146964856230032, + "grad_norm": 0.11593834310770035, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 5192 + }, + { + "epoch": 4.147763578274761, + "grad_norm": 0.17315705120563507, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5193 + }, + { + "epoch": 4.1485623003194885, + "grad_norm": 0.10884186625480652, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5194 + }, + { + "epoch": 4.149361022364217, + "grad_norm": 0.17528203129768372, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5195 + }, + { + "epoch": 4.1501597444089455, + "grad_norm": 0.3249641954898834, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5196 + }, + { + "epoch": 4.150958466453674, + "grad_norm": 0.2920859456062317, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5197 + }, + { + "epoch": 4.151757188498403, + "grad_norm": 0.12487918138504028, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5198 + }, + { + "epoch": 4.152555910543131, + "grad_norm": 0.07744348049163818, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 5199 + }, + { + "epoch": 4.15335463258786, + "grad_norm": 0.11721999943256378, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5200 + }, + { + "epoch": 4.154153354632588, + "grad_norm": 0.17566390335559845, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5201 + }, + { + "epoch": 4.154952076677317, + "grad_norm": 0.09762726724147797, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5202 + }, + { + "epoch": 4.155750798722044, + "grad_norm": 0.10769844055175781, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5203 + }, + { + "epoch": 4.156549520766773, + "grad_norm": 0.1608363389968872, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5204 + }, + { + "epoch": 4.157348242811501, + "grad_norm": 0.1575978696346283, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5205 + }, + { + "epoch": 4.15814696485623, + "grad_norm": 0.2035059779882431, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5206 + }, + { + "epoch": 4.1589456869009584, + "grad_norm": 0.1405210644006729, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5207 + }, + { + "epoch": 4.159744408945687, + "grad_norm": 0.18898408114910126, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5208 + }, + { + "epoch": 4.1605431309904155, + "grad_norm": 0.20012563467025757, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5209 + }, + { + "epoch": 4.161341853035144, + "grad_norm": 0.14585568010807037, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5210 + }, + { + "epoch": 4.162140575079873, + "grad_norm": 0.166448175907135, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5211 + }, + { + "epoch": 4.1629392971246, + "grad_norm": 0.08768735080957413, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5212 + }, + { + "epoch": 4.163738019169329, + "grad_norm": 0.12429258227348328, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5213 + }, + { + "epoch": 4.164536741214057, + "grad_norm": 0.06750953942537308, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 5214 + }, + { + "epoch": 4.165335463258786, + "grad_norm": 0.10137717425823212, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5215 + }, + { + "epoch": 4.166134185303514, + "grad_norm": 0.1015368178486824, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5216 + }, + { + "epoch": 4.166932907348243, + "grad_norm": 0.12396319955587387, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5217 + }, + { + "epoch": 4.167731629392971, + "grad_norm": 0.11295704543590546, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5218 + }, + { + "epoch": 4.1685303514377, + "grad_norm": 0.1415906846523285, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5219 + }, + { + "epoch": 4.169329073482428, + "grad_norm": 0.1300252079963684, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5220 + }, + { + "epoch": 4.170127795527157, + "grad_norm": 0.09486760199069977, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5221 + }, + { + "epoch": 4.170926517571885, + "grad_norm": 0.25776198506355286, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5222 + }, + { + "epoch": 4.171725239616613, + "grad_norm": 0.07684944570064545, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5223 + }, + { + "epoch": 4.172523961661342, + "grad_norm": 0.06909538060426712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5224 + }, + { + "epoch": 4.17332268370607, + "grad_norm": 0.09686419367790222, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5225 + }, + { + "epoch": 4.174121405750799, + "grad_norm": 0.10760180652141571, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5226 + }, + { + "epoch": 4.174920127795527, + "grad_norm": 0.0963902473449707, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5227 + }, + { + "epoch": 4.175718849840256, + "grad_norm": 0.12986192107200623, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5228 + }, + { + "epoch": 4.176517571884984, + "grad_norm": 0.12532354891300201, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5229 + }, + { + "epoch": 4.177316293929713, + "grad_norm": 0.158639058470726, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5230 + }, + { + "epoch": 4.178115015974441, + "grad_norm": 0.10025905817747116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5231 + }, + { + "epoch": 4.178913738019169, + "grad_norm": 0.19150952994823456, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5232 + }, + { + "epoch": 4.1797124600638975, + "grad_norm": 0.10650201886892319, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5233 + }, + { + "epoch": 4.180511182108626, + "grad_norm": 0.08948210626840591, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5234 + }, + { + "epoch": 4.181309904153355, + "grad_norm": 0.144260972738266, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5235 + }, + { + "epoch": 4.182108626198083, + "grad_norm": 0.10631201416254044, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5236 + }, + { + "epoch": 4.182907348242812, + "grad_norm": 0.17884188890457153, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5237 + }, + { + "epoch": 4.18370607028754, + "grad_norm": 0.12393054366111755, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5238 + }, + { + "epoch": 4.184504792332269, + "grad_norm": 0.10113117098808289, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5239 + }, + { + "epoch": 4.185303514376997, + "grad_norm": 0.08745535463094711, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5240 + }, + { + "epoch": 4.186102236421725, + "grad_norm": 0.12319829314947128, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5241 + }, + { + "epoch": 4.186900958466453, + "grad_norm": 0.10202868282794952, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5242 + }, + { + "epoch": 4.187699680511182, + "grad_norm": 0.12799306213855743, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5243 + }, + { + "epoch": 4.18849840255591, + "grad_norm": 0.10247227549552917, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5244 + }, + { + "epoch": 4.189297124600639, + "grad_norm": 0.0876200944185257, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5245 + }, + { + "epoch": 4.1900958466453675, + "grad_norm": 0.08829693496227264, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5246 + }, + { + "epoch": 4.190894568690096, + "grad_norm": 0.09005091339349747, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5247 + }, + { + "epoch": 4.1916932907348246, + "grad_norm": 0.06715424358844757, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5248 + }, + { + "epoch": 4.192492012779553, + "grad_norm": 0.11082255840301514, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5249 + }, + { + "epoch": 4.193290734824281, + "grad_norm": 0.08197743445634842, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5250 + }, + { + "epoch": 4.194089456869009, + "grad_norm": 0.08641887456178665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5251 + }, + { + "epoch": 4.194888178913738, + "grad_norm": 0.29264676570892334, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5252 + }, + { + "epoch": 4.195686900958466, + "grad_norm": 0.10122201591730118, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5253 + }, + { + "epoch": 4.196485623003195, + "grad_norm": 0.13220930099487305, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5254 + }, + { + "epoch": 4.197284345047923, + "grad_norm": 0.05919777229428291, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5255 + }, + { + "epoch": 4.198083067092652, + "grad_norm": 0.15947407484054565, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5256 + }, + { + "epoch": 4.19888178913738, + "grad_norm": 0.08046088367700577, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5257 + }, + { + "epoch": 4.199680511182109, + "grad_norm": 0.08504491299390793, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5258 + }, + { + "epoch": 4.2004792332268375, + "grad_norm": 0.2523876428604126, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5259 + }, + { + "epoch": 4.201277955271565, + "grad_norm": 0.32436496019363403, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5260 + }, + { + "epoch": 4.202076677316294, + "grad_norm": 0.3832956552505493, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5261 + }, + { + "epoch": 4.202875399361022, + "grad_norm": 0.15481804311275482, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5262 + }, + { + "epoch": 4.203674121405751, + "grad_norm": 0.5061212182044983, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5263 + }, + { + "epoch": 4.204472843450479, + "grad_norm": 0.2778873145580292, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5264 + }, + { + "epoch": 4.205271565495208, + "grad_norm": 0.10782434046268463, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5265 + }, + { + "epoch": 4.206070287539936, + "grad_norm": 0.2730430066585541, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5266 + }, + { + "epoch": 4.206869009584665, + "grad_norm": 0.14902958273887634, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5267 + }, + { + "epoch": 4.207667731629393, + "grad_norm": 0.2455812245607376, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5268 + }, + { + "epoch": 4.208466453674121, + "grad_norm": 0.36285653710365295, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5269 + }, + { + "epoch": 4.2092651757188495, + "grad_norm": 0.16104358434677124, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5270 + }, + { + "epoch": 4.210063897763578, + "grad_norm": 0.10330995172262192, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5271 + }, + { + "epoch": 4.210862619808307, + "grad_norm": 0.14438849687576294, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5272 + }, + { + "epoch": 4.211661341853035, + "grad_norm": 0.11719724535942078, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5273 + }, + { + "epoch": 4.212460063897764, + "grad_norm": 0.13503463566303253, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5274 + }, + { + "epoch": 4.213258785942492, + "grad_norm": 0.12717710435390472, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5275 + }, + { + "epoch": 4.214057507987221, + "grad_norm": 0.12293769419193268, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5276 + }, + { + "epoch": 4.214856230031949, + "grad_norm": 0.11828786134719849, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5277 + }, + { + "epoch": 4.215654952076678, + "grad_norm": 0.11118468642234802, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5278 + }, + { + "epoch": 4.216453674121405, + "grad_norm": 0.15688025951385498, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5279 + }, + { + "epoch": 4.217252396166134, + "grad_norm": 0.10603991895914078, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 5280 + }, + { + "epoch": 4.218051118210862, + "grad_norm": 0.14034971594810486, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5281 + }, + { + "epoch": 4.218849840255591, + "grad_norm": 0.21270571649074554, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5282 + }, + { + "epoch": 4.2196485623003195, + "grad_norm": 0.17699144780635834, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5283 + }, + { + "epoch": 4.220447284345048, + "grad_norm": 0.07665220648050308, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5284 + }, + { + "epoch": 4.2212460063897765, + "grad_norm": 0.13917282223701477, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5285 + }, + { + "epoch": 4.222044728434505, + "grad_norm": 0.1253320872783661, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5286 + }, + { + "epoch": 4.222843450479234, + "grad_norm": 0.07693646103143692, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5287 + }, + { + "epoch": 4.223642172523961, + "grad_norm": 0.11877891421318054, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5288 + }, + { + "epoch": 4.22444089456869, + "grad_norm": 0.08900399506092072, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5289 + }, + { + "epoch": 4.225239616613418, + "grad_norm": 0.08575741946697235, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5290 + }, + { + "epoch": 4.226038338658147, + "grad_norm": 0.11078973859548569, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5291 + }, + { + "epoch": 4.226837060702875, + "grad_norm": 0.12371394783258438, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5292 + }, + { + "epoch": 4.227635782747604, + "grad_norm": 0.11741651594638824, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5293 + }, + { + "epoch": 4.228434504792332, + "grad_norm": 0.1316244751214981, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5294 + }, + { + "epoch": 4.229233226837061, + "grad_norm": 0.07751733064651489, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5295 + }, + { + "epoch": 4.2300319488817895, + "grad_norm": 0.13512739539146423, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5296 + }, + { + "epoch": 4.230830670926518, + "grad_norm": 0.14408327639102936, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5297 + }, + { + "epoch": 4.231629392971246, + "grad_norm": 0.05596759170293808, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5298 + }, + { + "epoch": 4.232428115015974, + "grad_norm": 0.20518198609352112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5299 + }, + { + "epoch": 4.233226837060703, + "grad_norm": 0.17000356316566467, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5300 + }, + { + "epoch": 4.234025559105431, + "grad_norm": 0.10213350504636765, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5301 + }, + { + "epoch": 4.23482428115016, + "grad_norm": 0.1633368879556656, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5302 + }, + { + "epoch": 4.235623003194888, + "grad_norm": 0.17330236732959747, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5303 + }, + { + "epoch": 4.236421725239617, + "grad_norm": 0.20028679072856903, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5304 + }, + { + "epoch": 4.237220447284345, + "grad_norm": 0.23386533558368683, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5305 + }, + { + "epoch": 4.238019169329074, + "grad_norm": 0.051739469170570374, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5306 + }, + { + "epoch": 4.2388178913738015, + "grad_norm": 0.19732257723808289, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5307 + }, + { + "epoch": 4.23961661341853, + "grad_norm": 0.1318890005350113, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5308 + }, + { + "epoch": 4.2404153354632586, + "grad_norm": 0.17188113927841187, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5309 + }, + { + "epoch": 4.241214057507987, + "grad_norm": 0.23981456458568573, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5310 + }, + { + "epoch": 4.242012779552716, + "grad_norm": 0.15658913552761078, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5311 + }, + { + "epoch": 4.242811501597444, + "grad_norm": 0.13481132686138153, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5312 + }, + { + "epoch": 4.243610223642173, + "grad_norm": 0.16327355802059174, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5313 + }, + { + "epoch": 4.244408945686901, + "grad_norm": 0.0873674675822258, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5314 + }, + { + "epoch": 4.24520766773163, + "grad_norm": 0.16612505912780762, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5315 + }, + { + "epoch": 4.246006389776358, + "grad_norm": 0.15376444160938263, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5316 + }, + { + "epoch": 4.246805111821086, + "grad_norm": 0.07853512465953827, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5317 + }, + { + "epoch": 4.247603833865814, + "grad_norm": 0.11799992620944977, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5318 + }, + { + "epoch": 4.248402555910543, + "grad_norm": 0.09121575206518173, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 5319 + }, + { + "epoch": 4.2492012779552715, + "grad_norm": 0.09780153632164001, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5320 + }, + { + "epoch": 4.25, + "grad_norm": 0.11387690156698227, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 5321 + }, + { + "epoch": 4.2507987220447285, + "grad_norm": 0.08085697889328003, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 5322 + }, + { + "epoch": 4.251597444089457, + "grad_norm": 0.09986089169979095, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5323 + }, + { + "epoch": 4.252396166134186, + "grad_norm": 0.07728606462478638, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5324 + }, + { + "epoch": 4.253194888178914, + "grad_norm": 0.07464555650949478, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5325 + }, + { + "epoch": 4.253993610223642, + "grad_norm": 0.05129759758710861, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5326 + }, + { + "epoch": 4.25479233226837, + "grad_norm": 0.060275599360466, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5327 + }, + { + "epoch": 4.255591054313099, + "grad_norm": 0.07773016393184662, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5328 + }, + { + "epoch": 4.256389776357827, + "grad_norm": 0.1046462282538414, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 5329 + }, + { + "epoch": 4.257188498402556, + "grad_norm": 0.1184321865439415, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5330 + }, + { + "epoch": 4.257987220447284, + "grad_norm": 0.1419631987810135, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5331 + }, + { + "epoch": 4.258785942492013, + "grad_norm": 0.10022144019603729, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5332 + }, + { + "epoch": 4.2595846645367414, + "grad_norm": 0.075701504945755, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5333 + }, + { + "epoch": 4.26038338658147, + "grad_norm": 0.18145573139190674, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5334 + }, + { + "epoch": 4.261182108626198, + "grad_norm": 0.06092703342437744, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5335 + }, + { + "epoch": 4.261980830670926, + "grad_norm": 0.13196219503879547, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5336 + }, + { + "epoch": 4.262779552715655, + "grad_norm": 0.17139793932437897, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5337 + }, + { + "epoch": 4.263578274760383, + "grad_norm": 0.12072623521089554, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5338 + }, + { + "epoch": 4.264376996805112, + "grad_norm": 0.11874449253082275, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5339 + }, + { + "epoch": 4.26517571884984, + "grad_norm": 0.10718921571969986, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5340 + }, + { + "epoch": 4.265974440894569, + "grad_norm": 0.07337968051433563, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 5341 + }, + { + "epoch": 4.266773162939297, + "grad_norm": 0.11872536689043045, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5342 + }, + { + "epoch": 4.267571884984026, + "grad_norm": 0.11199923604726791, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5343 + }, + { + "epoch": 4.268370607028754, + "grad_norm": 0.05864759162068367, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5344 + }, + { + "epoch": 4.269169329073483, + "grad_norm": 0.14757969975471497, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5345 + }, + { + "epoch": 4.2699680511182105, + "grad_norm": 0.12190169841051102, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5346 + }, + { + "epoch": 4.270766773162939, + "grad_norm": 0.0532461479306221, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5347 + }, + { + "epoch": 4.271565495207668, + "grad_norm": 0.10723208636045456, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5348 + }, + { + "epoch": 4.272364217252396, + "grad_norm": 0.07115229964256287, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5349 + }, + { + "epoch": 4.273162939297125, + "grad_norm": 0.07450878620147705, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 5350 + }, + { + "epoch": 4.273961661341853, + "grad_norm": 0.11793115735054016, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5351 + }, + { + "epoch": 4.274760383386582, + "grad_norm": 0.10440219938755035, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5352 + }, + { + "epoch": 4.27555910543131, + "grad_norm": 0.27991926670074463, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5353 + }, + { + "epoch": 4.276357827476039, + "grad_norm": 0.11090446263551712, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5354 + }, + { + "epoch": 4.277156549520766, + "grad_norm": 0.10509627312421799, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5355 + }, + { + "epoch": 4.277955271565495, + "grad_norm": 0.06217970326542854, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5356 + }, + { + "epoch": 4.2787539936102235, + "grad_norm": 0.34369224309921265, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5357 + }, + { + "epoch": 4.279552715654952, + "grad_norm": 0.1246214285492897, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5358 + }, + { + "epoch": 4.2803514376996805, + "grad_norm": 0.06331677734851837, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 5359 + }, + { + "epoch": 4.281150159744409, + "grad_norm": 0.08274740725755692, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5360 + }, + { + "epoch": 4.281948881789138, + "grad_norm": 0.06133527308702469, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5361 + }, + { + "epoch": 4.282747603833866, + "grad_norm": 0.09867174178361893, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5362 + }, + { + "epoch": 4.283546325878595, + "grad_norm": 0.09370579570531845, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5363 + }, + { + "epoch": 4.284345047923322, + "grad_norm": 0.2549540400505066, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5364 + }, + { + "epoch": 4.285143769968051, + "grad_norm": 0.1900271773338318, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5365 + }, + { + "epoch": 4.285942492012779, + "grad_norm": 0.21450525522232056, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 5366 + }, + { + "epoch": 4.286741214057508, + "grad_norm": 0.1381012350320816, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 5367 + }, + { + "epoch": 4.287539936102236, + "grad_norm": 0.0813983827829361, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5368 + }, + { + "epoch": 4.288338658146965, + "grad_norm": 0.16513130068778992, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5369 + }, + { + "epoch": 4.289137380191693, + "grad_norm": 0.10825667530298233, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5370 + }, + { + "epoch": 4.289936102236422, + "grad_norm": 0.07226242125034332, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5371 + }, + { + "epoch": 4.2907348242811505, + "grad_norm": 0.1278400719165802, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5372 + }, + { + "epoch": 4.291533546325878, + "grad_norm": 0.11092592030763626, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5373 + }, + { + "epoch": 4.292332268370607, + "grad_norm": 0.08732229471206665, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 5374 + }, + { + "epoch": 4.293130990415335, + "grad_norm": 0.2182341367006302, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5375 + }, + { + "epoch": 4.293929712460064, + "grad_norm": 0.10107403993606567, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5376 + }, + { + "epoch": 4.294728434504792, + "grad_norm": 0.13586364686489105, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5377 + }, + { + "epoch": 4.295527156549521, + "grad_norm": 0.3685734272003174, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5378 + }, + { + "epoch": 4.296325878594249, + "grad_norm": 0.13060712814331055, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5379 + }, + { + "epoch": 4.297124600638978, + "grad_norm": 0.05988436937332153, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5380 + }, + { + "epoch": 4.297923322683706, + "grad_norm": 0.14392045140266418, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 5381 + }, + { + "epoch": 4.298722044728435, + "grad_norm": 0.25003254413604736, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5382 + }, + { + "epoch": 4.2995207667731625, + "grad_norm": 0.055451687425374985, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5383 + }, + { + "epoch": 4.300319488817891, + "grad_norm": 0.11186914891004562, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5384 + }, + { + "epoch": 4.30111821086262, + "grad_norm": 0.11314704269170761, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5385 + }, + { + "epoch": 4.301916932907348, + "grad_norm": 0.43445560336112976, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5386 + }, + { + "epoch": 4.302715654952077, + "grad_norm": 0.09362242370843887, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5387 + }, + { + "epoch": 4.303514376996805, + "grad_norm": 0.04405852034687996, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5388 + }, + { + "epoch": 4.304313099041534, + "grad_norm": 0.12615318596363068, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5389 + }, + { + "epoch": 4.305111821086262, + "grad_norm": 0.1067153736948967, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5390 + }, + { + "epoch": 4.305910543130991, + "grad_norm": 0.05732683837413788, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5391 + }, + { + "epoch": 4.306709265175719, + "grad_norm": 0.2452571988105774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5392 + }, + { + "epoch": 4.307507987220447, + "grad_norm": 0.11733133345842361, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 5393 + }, + { + "epoch": 4.3083067092651754, + "grad_norm": 0.06771894544363022, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5394 + }, + { + "epoch": 4.309105431309904, + "grad_norm": 0.12928563356399536, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5395 + }, + { + "epoch": 4.3099041533546325, + "grad_norm": 0.1777956187725067, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5396 + }, + { + "epoch": 4.310702875399361, + "grad_norm": 0.1281544715166092, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5397 + }, + { + "epoch": 4.31150159744409, + "grad_norm": 0.07120000571012497, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 5398 + }, + { + "epoch": 4.312300319488818, + "grad_norm": 0.1270848512649536, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5399 + }, + { + "epoch": 4.313099041533547, + "grad_norm": 0.17685648798942566, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5400 + }, + { + "epoch": 4.313897763578275, + "grad_norm": 0.05070900544524193, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5401 + }, + { + "epoch": 4.314696485623003, + "grad_norm": 0.10543418675661087, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5402 + }, + { + "epoch": 4.315495207667731, + "grad_norm": 0.12336398661136627, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5403 + }, + { + "epoch": 4.31629392971246, + "grad_norm": 0.1583624631166458, + "learning_rate": 0.0005, + "loss": 1.0446, + "step": 5404 + }, + { + "epoch": 4.317092651757188, + "grad_norm": 0.08186022192239761, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5405 + }, + { + "epoch": 4.317891373801917, + "grad_norm": 0.07562705129384995, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5406 + }, + { + "epoch": 4.318690095846645, + "grad_norm": 0.05275554209947586, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5407 + }, + { + "epoch": 4.319488817891374, + "grad_norm": 0.06432928144931793, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5408 + }, + { + "epoch": 4.3202875399361025, + "grad_norm": 0.08220377564430237, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5409 + }, + { + "epoch": 4.321086261980831, + "grad_norm": 0.07882758229970932, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 5410 + }, + { + "epoch": 4.321884984025559, + "grad_norm": 0.138245090842247, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 5411 + }, + { + "epoch": 4.322683706070287, + "grad_norm": 0.1127534806728363, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5412 + }, + { + "epoch": 4.323482428115016, + "grad_norm": 0.1985669732093811, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5413 + }, + { + "epoch": 4.324281150159744, + "grad_norm": 0.08023711293935776, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5414 + }, + { + "epoch": 4.325079872204473, + "grad_norm": 0.13853015005588531, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 5415 + }, + { + "epoch": 4.325878594249201, + "grad_norm": 0.18319782614707947, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5416 + }, + { + "epoch": 4.32667731629393, + "grad_norm": 0.073015958070755, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5417 + }, + { + "epoch": 4.327476038338658, + "grad_norm": 0.10771846771240234, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5418 + }, + { + "epoch": 4.328274760383387, + "grad_norm": 0.09512028843164444, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 5419 + }, + { + "epoch": 4.329073482428115, + "grad_norm": 0.0822201818227768, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5420 + }, + { + "epoch": 4.329872204472843, + "grad_norm": 0.11839213222265244, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5421 + }, + { + "epoch": 4.330670926517572, + "grad_norm": 0.10274796187877655, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5422 + }, + { + "epoch": 4.3314696485623, + "grad_norm": 0.05896717682480812, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5423 + }, + { + "epoch": 4.332268370607029, + "grad_norm": 0.1268780380487442, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5424 + }, + { + "epoch": 4.333067092651757, + "grad_norm": 0.09173188358545303, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5425 + }, + { + "epoch": 4.333865814696486, + "grad_norm": 0.05155360326170921, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5426 + }, + { + "epoch": 4.334664536741214, + "grad_norm": 0.08836793899536133, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5427 + }, + { + "epoch": 4.335463258785943, + "grad_norm": 0.08620470017194748, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5428 + }, + { + "epoch": 4.336261980830671, + "grad_norm": 0.06972123682498932, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5429 + }, + { + "epoch": 4.3370607028754, + "grad_norm": 0.12461638450622559, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5430 + }, + { + "epoch": 4.337859424920127, + "grad_norm": 0.08546463400125504, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5431 + }, + { + "epoch": 4.338658146964856, + "grad_norm": 0.08495177328586578, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5432 + }, + { + "epoch": 4.3394568690095845, + "grad_norm": 0.13017377257347107, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5433 + }, + { + "epoch": 4.340255591054313, + "grad_norm": 0.13619504868984222, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5434 + }, + { + "epoch": 4.3410543130990416, + "grad_norm": 0.5835675597190857, + "learning_rate": 0.0005, + "loss": 1.0435, + "step": 5435 + }, + { + "epoch": 4.34185303514377, + "grad_norm": 0.09355206042528152, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5436 + }, + { + "epoch": 4.342651757188499, + "grad_norm": 0.08626751601696014, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5437 + }, + { + "epoch": 4.343450479233227, + "grad_norm": 0.05652647092938423, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5438 + }, + { + "epoch": 4.344249201277956, + "grad_norm": 0.05232316255569458, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5439 + }, + { + "epoch": 4.345047923322683, + "grad_norm": 0.08115233480930328, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5440 + }, + { + "epoch": 4.345846645367412, + "grad_norm": 0.08757120370864868, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5441 + }, + { + "epoch": 4.34664536741214, + "grad_norm": 0.046224139630794525, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5442 + }, + { + "epoch": 4.347444089456869, + "grad_norm": 0.07967934757471085, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5443 + }, + { + "epoch": 4.348242811501597, + "grad_norm": 0.044298652559518814, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5444 + }, + { + "epoch": 4.349041533546326, + "grad_norm": 0.09021158516407013, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 5445 + }, + { + "epoch": 4.3498402555910545, + "grad_norm": 0.12857890129089355, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5446 + }, + { + "epoch": 4.350638977635783, + "grad_norm": 0.05655589699745178, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 5447 + }, + { + "epoch": 4.3514376996805115, + "grad_norm": 0.09304624050855637, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5448 + }, + { + "epoch": 4.352236421725239, + "grad_norm": 0.19815632700920105, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5449 + }, + { + "epoch": 4.353035143769968, + "grad_norm": 0.0526299886405468, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5450 + }, + { + "epoch": 4.353833865814696, + "grad_norm": 0.06432242691516876, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5451 + }, + { + "epoch": 4.354632587859425, + "grad_norm": 0.07848794758319855, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5452 + }, + { + "epoch": 4.355431309904153, + "grad_norm": 0.08260536193847656, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5453 + }, + { + "epoch": 4.356230031948882, + "grad_norm": 0.052810169756412506, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 5454 + }, + { + "epoch": 4.35702875399361, + "grad_norm": 0.06942226737737656, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 5455 + }, + { + "epoch": 4.357827476038339, + "grad_norm": 0.13892871141433716, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5456 + }, + { + "epoch": 4.358626198083067, + "grad_norm": 0.15982909500598907, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5457 + }, + { + "epoch": 4.359424920127796, + "grad_norm": 0.08206653594970703, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 5458 + }, + { + "epoch": 4.360223642172524, + "grad_norm": 0.08957790583372116, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 5459 + }, + { + "epoch": 4.361022364217252, + "grad_norm": 0.03882770985364914, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5460 + }, + { + "epoch": 4.361821086261981, + "grad_norm": 0.0928555279970169, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5461 + }, + { + "epoch": 4.362619808306709, + "grad_norm": 0.057321447879076004, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5462 + }, + { + "epoch": 4.363418530351438, + "grad_norm": 0.0737103596329689, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 5463 + }, + { + "epoch": 4.364217252396166, + "grad_norm": 0.06696293503046036, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5464 + }, + { + "epoch": 4.365015974440895, + "grad_norm": 0.04572489857673645, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 5465 + }, + { + "epoch": 4.365814696485623, + "grad_norm": 0.094516322016716, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 5466 + }, + { + "epoch": 4.366613418530352, + "grad_norm": 0.045576825737953186, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5467 + }, + { + "epoch": 4.36741214057508, + "grad_norm": 0.06839725375175476, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 5468 + }, + { + "epoch": 4.368210862619808, + "grad_norm": 0.14465193450450897, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5469 + }, + { + "epoch": 4.3690095846645365, + "grad_norm": 0.07930073887109756, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5470 + }, + { + "epoch": 4.369808306709265, + "grad_norm": 0.06120619550347328, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5471 + }, + { + "epoch": 4.3706070287539935, + "grad_norm": 0.066256083548069, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 5472 + }, + { + "epoch": 4.371405750798722, + "grad_norm": 0.11696353554725647, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5473 + }, + { + "epoch": 4.372204472843451, + "grad_norm": 0.11530395597219467, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5474 + }, + { + "epoch": 4.373003194888179, + "grad_norm": 0.05663579702377319, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 5475 + }, + { + "epoch": 4.373801916932908, + "grad_norm": 0.1241946592926979, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5476 + }, + { + "epoch": 4.374600638977636, + "grad_norm": 0.1725323498249054, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5477 + }, + { + "epoch": 4.375399361022364, + "grad_norm": 0.09785371273756027, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5478 + }, + { + "epoch": 4.376198083067092, + "grad_norm": 0.0813792496919632, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5479 + }, + { + "epoch": 4.376996805111821, + "grad_norm": 0.17471592128276825, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5480 + }, + { + "epoch": 4.377795527156549, + "grad_norm": 0.1923220455646515, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5481 + }, + { + "epoch": 4.378594249201278, + "grad_norm": 0.09857932478189468, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5482 + }, + { + "epoch": 4.3793929712460065, + "grad_norm": 0.10073419660329819, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5483 + }, + { + "epoch": 4.380191693290735, + "grad_norm": 0.35731273889541626, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5484 + }, + { + "epoch": 4.3809904153354635, + "grad_norm": 0.12060656398534775, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 5485 + }, + { + "epoch": 4.381789137380192, + "grad_norm": 0.10264381766319275, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5486 + }, + { + "epoch": 4.38258785942492, + "grad_norm": 0.0868317037820816, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5487 + }, + { + "epoch": 4.383386581469648, + "grad_norm": 0.07722344994544983, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5488 + }, + { + "epoch": 4.384185303514377, + "grad_norm": 0.3690173327922821, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5489 + }, + { + "epoch": 4.384984025559105, + "grad_norm": 0.18400169909000397, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5490 + }, + { + "epoch": 4.385782747603834, + "grad_norm": 0.14671844244003296, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5491 + }, + { + "epoch": 4.386581469648562, + "grad_norm": 0.05277179554104805, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 5492 + }, + { + "epoch": 4.387380191693291, + "grad_norm": 0.13593660295009613, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5493 + }, + { + "epoch": 4.388178913738019, + "grad_norm": 0.1318334937095642, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 5494 + }, + { + "epoch": 4.388977635782748, + "grad_norm": 0.07189908623695374, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5495 + }, + { + "epoch": 4.389776357827476, + "grad_norm": 0.07969736307859421, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5496 + }, + { + "epoch": 4.390575079872204, + "grad_norm": 0.07449150085449219, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5497 + }, + { + "epoch": 4.391373801916933, + "grad_norm": 0.533295214176178, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5498 + }, + { + "epoch": 4.392172523961661, + "grad_norm": 0.10412111133337021, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 5499 + }, + { + "epoch": 4.39297124600639, + "grad_norm": 0.08482066541910172, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 5500 + }, + { + "epoch": 4.393769968051118, + "grad_norm": 0.08023949712514877, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5501 + }, + { + "epoch": 4.394568690095847, + "grad_norm": 0.16967490315437317, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5502 + }, + { + "epoch": 4.395367412140575, + "grad_norm": 0.1979716271162033, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5503 + }, + { + "epoch": 4.396166134185304, + "grad_norm": 0.09058263152837753, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 5504 + }, + { + "epoch": 4.396964856230032, + "grad_norm": 0.13149574398994446, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 5505 + }, + { + "epoch": 4.397763578274761, + "grad_norm": 0.08240146189928055, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5506 + }, + { + "epoch": 4.3985623003194885, + "grad_norm": 0.13789936900138855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 5507 + }, + { + "epoch": 4.399361022364217, + "grad_norm": 0.18576087057590485, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5508 + }, + { + "epoch": 4.4001597444089455, + "grad_norm": 0.13780297338962555, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 5509 + }, + { + "epoch": 4.400958466453674, + "grad_norm": 0.14724896848201752, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5510 + }, + { + "epoch": 4.401757188498403, + "grad_norm": 0.20418551564216614, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 5511 + }, + { + "epoch": 4.402555910543131, + "grad_norm": 0.1841040551662445, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 5512 + }, + { + "epoch": 4.40335463258786, + "grad_norm": 0.6994684338569641, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 5513 + }, + { + "epoch": 4.404153354632588, + "grad_norm": 0.18882393836975098, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5514 + }, + { + "epoch": 4.404952076677317, + "grad_norm": 0.07170864939689636, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 5515 + }, + { + "epoch": 4.405750798722044, + "grad_norm": 0.04765893518924713, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5516 + }, + { + "epoch": 4.406549520766773, + "grad_norm": 0.07294443249702454, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 5517 + }, + { + "epoch": 4.407348242811501, + "grad_norm": 0.18566831946372986, + "learning_rate": 0.0005, + "loss": 1.0434, + "step": 5518 + }, + { + "epoch": 4.40814696485623, + "grad_norm": 0.10881441831588745, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 5519 + }, + { + "epoch": 4.4089456869009584, + "grad_norm": 0.380438894033432, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 5520 + }, + { + "epoch": 4.409744408945687, + "grad_norm": 0.19281962513923645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 5521 + }, + { + "epoch": 4.4105431309904155, + "grad_norm": 0.05730361491441727, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5522 + }, + { + "epoch": 4.411341853035144, + "grad_norm": 0.09276643395423889, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5523 + }, + { + "epoch": 4.412140575079873, + "grad_norm": 0.070807084441185, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5524 + }, + { + "epoch": 4.4129392971246, + "grad_norm": 0.08902080357074738, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5525 + }, + { + "epoch": 4.413738019169329, + "grad_norm": 0.14861932396888733, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5526 + }, + { + "epoch": 4.414536741214057, + "grad_norm": 0.2678995728492737, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 5527 + }, + { + "epoch": 4.415335463258786, + "grad_norm": 0.12902382016181946, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5528 + }, + { + "epoch": 4.416134185303514, + "grad_norm": 0.14999063313007355, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 5529 + }, + { + "epoch": 4.416932907348243, + "grad_norm": 0.13950730860233307, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5530 + }, + { + "epoch": 4.417731629392971, + "grad_norm": 0.12215374410152435, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5531 + }, + { + "epoch": 4.4185303514377, + "grad_norm": 0.12941284477710724, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5532 + }, + { + "epoch": 4.419329073482428, + "grad_norm": 0.22524291276931763, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5533 + }, + { + "epoch": 4.420127795527157, + "grad_norm": 0.0830528736114502, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 5534 + }, + { + "epoch": 4.420926517571885, + "grad_norm": 0.1562981903553009, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5535 + }, + { + "epoch": 4.421725239616613, + "grad_norm": 0.19052654504776, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5536 + }, + { + "epoch": 4.422523961661342, + "grad_norm": 0.12264347821474075, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5537 + }, + { + "epoch": 4.42332268370607, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5538 + }, + { + "epoch": 4.424121405750799, + "grad_norm": 0.1412813812494278, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5539 + }, + { + "epoch": 4.424920127795527, + "grad_norm": 0.17808450758457184, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5540 + }, + { + "epoch": 4.425718849840256, + "grad_norm": 0.43806061148643494, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5541 + }, + { + "epoch": 4.426517571884984, + "grad_norm": 0.17728228867053986, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5542 + }, + { + "epoch": 4.427316293929713, + "grad_norm": 0.12434227764606476, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5543 + }, + { + "epoch": 4.428115015974441, + "grad_norm": 0.10051420331001282, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5544 + }, + { + "epoch": 4.428913738019169, + "grad_norm": 0.0943203940987587, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5545 + }, + { + "epoch": 4.4297124600638975, + "grad_norm": 0.08082996308803558, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5546 + }, + { + "epoch": 4.430511182108626, + "grad_norm": 0.13405202329158783, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5547 + }, + { + "epoch": 4.431309904153355, + "grad_norm": 0.10448389500379562, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5548 + }, + { + "epoch": 4.432108626198083, + "grad_norm": 0.32405009865760803, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 5549 + }, + { + "epoch": 4.432907348242812, + "grad_norm": 0.09690065681934357, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5550 + }, + { + "epoch": 4.43370607028754, + "grad_norm": 0.35410076379776, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5551 + }, + { + "epoch": 4.434504792332269, + "grad_norm": 0.17826306819915771, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5552 + }, + { + "epoch": 4.435303514376997, + "grad_norm": 0.2252579778432846, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5553 + }, + { + "epoch": 4.436102236421725, + "grad_norm": 0.09508918970823288, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 5554 + }, + { + "epoch": 4.436900958466453, + "grad_norm": 0.16872358322143555, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5555 + }, + { + "epoch": 4.437699680511182, + "grad_norm": 0.24836355447769165, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5556 + }, + { + "epoch": 4.43849840255591, + "grad_norm": 0.20887835323810577, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5557 + }, + { + "epoch": 4.439297124600639, + "grad_norm": 0.10922685265541077, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5558 + }, + { + "epoch": 4.4400958466453675, + "grad_norm": 0.44561028480529785, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5559 + }, + { + "epoch": 4.440894568690096, + "grad_norm": 0.18160179257392883, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5560 + }, + { + "epoch": 4.4416932907348246, + "grad_norm": 0.06924877315759659, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5561 + }, + { + "epoch": 4.442492012779553, + "grad_norm": 0.15605933964252472, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 5562 + }, + { + "epoch": 4.443290734824281, + "grad_norm": 0.10880772024393082, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 5563 + }, + { + "epoch": 4.444089456869009, + "grad_norm": 0.1252668797969818, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5564 + }, + { + "epoch": 4.444888178913738, + "grad_norm": 0.20452634990215302, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5565 + }, + { + "epoch": 4.445686900958466, + "grad_norm": 0.20973001420497894, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 5566 + }, + { + "epoch": 4.446485623003195, + "grad_norm": 0.07631060481071472, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5567 + }, + { + "epoch": 4.447284345047923, + "grad_norm": 0.14793622493743896, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5568 + }, + { + "epoch": 4.448083067092652, + "grad_norm": 0.30125850439071655, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5569 + }, + { + "epoch": 4.44888178913738, + "grad_norm": 0.1291274130344391, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5570 + }, + { + "epoch": 4.449680511182109, + "grad_norm": 0.08679793030023575, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5571 + }, + { + "epoch": 4.4504792332268375, + "grad_norm": 0.11555953323841095, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5572 + }, + { + "epoch": 4.451277955271565, + "grad_norm": 0.10711846500635147, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5573 + }, + { + "epoch": 4.452076677316294, + "grad_norm": 0.0604897104203701, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5574 + }, + { + "epoch": 4.452875399361022, + "grad_norm": 0.08729933202266693, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 5575 + }, + { + "epoch": 4.453674121405751, + "grad_norm": 0.09586715698242188, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5576 + }, + { + "epoch": 4.454472843450479, + "grad_norm": 0.11635993421077728, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5577 + }, + { + "epoch": 4.455271565495208, + "grad_norm": 0.12405801564455032, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5578 + }, + { + "epoch": 4.456070287539936, + "grad_norm": 0.1284986287355423, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 5579 + }, + { + "epoch": 4.456869009584665, + "grad_norm": 0.09059973061084747, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 5580 + }, + { + "epoch": 4.457667731629393, + "grad_norm": 0.08497101068496704, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5581 + }, + { + "epoch": 4.458466453674122, + "grad_norm": 0.10315481573343277, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5582 + }, + { + "epoch": 4.4592651757188495, + "grad_norm": 0.09923984855413437, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5583 + }, + { + "epoch": 4.460063897763578, + "grad_norm": 0.09179794788360596, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5584 + }, + { + "epoch": 4.460862619808307, + "grad_norm": 0.0783005952835083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5585 + }, + { + "epoch": 4.461661341853035, + "grad_norm": 0.4005993604660034, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5586 + }, + { + "epoch": 4.462460063897764, + "grad_norm": 0.09382215887308121, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5587 + }, + { + "epoch": 4.463258785942492, + "grad_norm": 0.10208452492952347, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 5588 + }, + { + "epoch": 4.464057507987221, + "grad_norm": 0.08237040042877197, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 5589 + }, + { + "epoch": 4.464856230031949, + "grad_norm": 0.07287969440221786, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 5590 + }, + { + "epoch": 4.465654952076678, + "grad_norm": 0.07156763970851898, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 5591 + }, + { + "epoch": 4.466453674121405, + "grad_norm": 0.11347219347953796, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5592 + }, + { + "epoch": 4.467252396166134, + "grad_norm": 0.13722039759159088, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 5593 + }, + { + "epoch": 4.468051118210862, + "grad_norm": 0.20186153054237366, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5594 + }, + { + "epoch": 4.468849840255591, + "grad_norm": 0.1548159420490265, + "learning_rate": 0.0005, + "loss": 1.0444, + "step": 5595 + }, + { + "epoch": 4.4696485623003195, + "grad_norm": 0.08960088342428207, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 5596 + }, + { + "epoch": 4.470447284345048, + "grad_norm": 0.23552097380161285, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5597 + }, + { + "epoch": 4.4712460063897765, + "grad_norm": 0.34478914737701416, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 5598 + }, + { + "epoch": 4.472044728434505, + "grad_norm": 0.219953253865242, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5599 + }, + { + "epoch": 4.472843450479234, + "grad_norm": 0.13104191422462463, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5600 + }, + { + "epoch": 4.473642172523961, + "grad_norm": 0.2867056131362915, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5601 + }, + { + "epoch": 4.47444089456869, + "grad_norm": 0.15794725716114044, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 5602 + }, + { + "epoch": 4.475239616613418, + "grad_norm": 0.10884165018796921, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5603 + }, + { + "epoch": 4.476038338658147, + "grad_norm": 1.0521267652511597, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 5604 + }, + { + "epoch": 4.476837060702875, + "grad_norm": 0.07823536545038223, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 5605 + }, + { + "epoch": 4.477635782747604, + "grad_norm": 0.1536101996898651, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5606 + }, + { + "epoch": 4.478434504792332, + "grad_norm": 0.1379251778125763, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 5607 + }, + { + "epoch": 4.479233226837061, + "grad_norm": 0.06181122735142708, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5608 + }, + { + "epoch": 4.4800319488817895, + "grad_norm": 0.1701904535293579, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 5609 + }, + { + "epoch": 4.480830670926517, + "grad_norm": 0.1322227120399475, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5610 + }, + { + "epoch": 4.481629392971246, + "grad_norm": 0.09158491343259811, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5611 + }, + { + "epoch": 4.482428115015974, + "grad_norm": 0.09851136803627014, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5612 + }, + { + "epoch": 4.483226837060703, + "grad_norm": 0.09350419789552689, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5613 + }, + { + "epoch": 4.484025559105431, + "grad_norm": 0.40614885091781616, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5614 + }, + { + "epoch": 4.48482428115016, + "grad_norm": 0.1653166264295578, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 5615 + }, + { + "epoch": 4.485623003194888, + "grad_norm": 0.13429352641105652, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 5616 + }, + { + "epoch": 4.486421725239617, + "grad_norm": 0.09340473264455795, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 5617 + }, + { + "epoch": 4.487220447284345, + "grad_norm": 0.1621188223361969, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5618 + }, + { + "epoch": 4.488019169329074, + "grad_norm": 0.18538816273212433, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5619 + }, + { + "epoch": 4.488817891373802, + "grad_norm": 0.26981350779533386, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5620 + }, + { + "epoch": 4.48961661341853, + "grad_norm": 0.28865110874176025, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5621 + }, + { + "epoch": 4.4904153354632586, + "grad_norm": 0.23013874888420105, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5622 + }, + { + "epoch": 4.491214057507987, + "grad_norm": 0.08305853605270386, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5623 + }, + { + "epoch": 4.492012779552716, + "grad_norm": 0.1810445487499237, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5624 + }, + { + "epoch": 4.492811501597444, + "grad_norm": 0.23000332713127136, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 5625 + }, + { + "epoch": 4.493610223642173, + "grad_norm": 0.06753652542829514, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5626 + }, + { + "epoch": 4.494408945686901, + "grad_norm": 0.19956068694591522, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5627 + }, + { + "epoch": 4.49520766773163, + "grad_norm": 0.24572248756885529, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5628 + }, + { + "epoch": 4.496006389776358, + "grad_norm": 0.06617605686187744, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5629 + }, + { + "epoch": 4.496805111821086, + "grad_norm": 0.18551495671272278, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5630 + }, + { + "epoch": 4.497603833865814, + "grad_norm": 0.16827648878097534, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 5631 + }, + { + "epoch": 4.498402555910543, + "grad_norm": 0.13273993134498596, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5632 + }, + { + "epoch": 4.4992012779552715, + "grad_norm": 0.24461479485034943, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5633 + }, + { + "epoch": 4.5, + "grad_norm": 0.2016836553812027, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5634 + }, + { + "epoch": 4.5007987220447285, + "grad_norm": 0.07513006776571274, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 5635 + }, + { + "epoch": 4.501597444089457, + "grad_norm": 0.1701919138431549, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 5636 + }, + { + "epoch": 4.502396166134186, + "grad_norm": 0.12785466015338898, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5637 + }, + { + "epoch": 4.503194888178914, + "grad_norm": 0.1135641485452652, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5638 + }, + { + "epoch": 4.503993610223642, + "grad_norm": 0.5004979372024536, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5639 + }, + { + "epoch": 4.50479233226837, + "grad_norm": 0.28730812668800354, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5640 + }, + { + "epoch": 4.505591054313099, + "grad_norm": 0.3666481673717499, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 5641 + }, + { + "epoch": 4.506389776357827, + "grad_norm": 0.257710337638855, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 5642 + }, + { + "epoch": 4.507188498402556, + "grad_norm": 0.20071941614151, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5643 + }, + { + "epoch": 4.507987220447284, + "grad_norm": 0.3445729613304138, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5644 + }, + { + "epoch": 4.508785942492013, + "grad_norm": 0.20297282934188843, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5645 + }, + { + "epoch": 4.5095846645367414, + "grad_norm": 0.1889636069536209, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5646 + }, + { + "epoch": 4.51038338658147, + "grad_norm": 0.2153794765472412, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5647 + }, + { + "epoch": 4.511182108626198, + "grad_norm": 0.15353621542453766, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5648 + }, + { + "epoch": 4.511980830670926, + "grad_norm": 0.1575399786233902, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5649 + }, + { + "epoch": 4.512779552715655, + "grad_norm": 0.5555608868598938, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5650 + }, + { + "epoch": 4.513578274760383, + "grad_norm": 0.26887524127960205, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 5651 + }, + { + "epoch": 4.514376996805112, + "grad_norm": 0.11516866087913513, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5652 + }, + { + "epoch": 4.51517571884984, + "grad_norm": 0.19820965826511383, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5653 + }, + { + "epoch": 4.515974440894569, + "grad_norm": 0.2122081071138382, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 5654 + }, + { + "epoch": 4.516773162939297, + "grad_norm": 0.10736703872680664, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5655 + }, + { + "epoch": 4.517571884984026, + "grad_norm": 0.09852312505245209, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 5656 + }, + { + "epoch": 4.518370607028754, + "grad_norm": 0.07539162784814835, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5657 + }, + { + "epoch": 4.519169329073483, + "grad_norm": 0.07467353343963623, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 5658 + }, + { + "epoch": 4.5199680511182105, + "grad_norm": 0.09987884759902954, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 5659 + }, + { + "epoch": 4.520766773162939, + "grad_norm": 0.08720221370458603, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 5660 + }, + { + "epoch": 4.521565495207668, + "grad_norm": 0.07798969000577927, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 5661 + }, + { + "epoch": 4.522364217252396, + "grad_norm": 0.12410122901201248, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5662 + }, + { + "epoch": 4.523162939297125, + "grad_norm": 0.07746852934360504, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5663 + }, + { + "epoch": 4.523961661341853, + "grad_norm": 0.09171058982610703, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 5664 + }, + { + "epoch": 4.524760383386582, + "grad_norm": 0.8176944255828857, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5665 + }, + { + "epoch": 4.52555910543131, + "grad_norm": 0.4282614290714264, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5666 + }, + { + "epoch": 4.526357827476039, + "grad_norm": 0.35193827748298645, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5667 + }, + { + "epoch": 4.527156549520766, + "grad_norm": 0.15641339123249054, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5668 + }, + { + "epoch": 4.527955271565495, + "grad_norm": 0.31442952156066895, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5669 + }, + { + "epoch": 4.5287539936102235, + "grad_norm": 0.3205500841140747, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5670 + }, + { + "epoch": 4.529552715654952, + "grad_norm": 0.2866390645503998, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5671 + }, + { + "epoch": 4.5303514376996805, + "grad_norm": 0.21028868854045868, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 5672 + }, + { + "epoch": 4.531150159744409, + "grad_norm": 0.32687097787857056, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5673 + }, + { + "epoch": 4.531948881789138, + "grad_norm": 0.25662627816200256, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5674 + }, + { + "epoch": 4.532747603833866, + "grad_norm": 0.10192561894655228, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5675 + }, + { + "epoch": 4.533546325878595, + "grad_norm": 0.8102573752403259, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5676 + }, + { + "epoch": 4.534345047923322, + "grad_norm": 0.19127781689167023, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5677 + }, + { + "epoch": 4.535143769968051, + "grad_norm": 0.22435548901557922, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5678 + }, + { + "epoch": 4.535942492012779, + "grad_norm": 0.3271692395210266, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5679 + }, + { + "epoch": 4.536741214057508, + "grad_norm": 0.17226184904575348, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 5680 + }, + { + "epoch": 4.537539936102236, + "grad_norm": 0.16628077626228333, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 5681 + }, + { + "epoch": 4.538338658146965, + "grad_norm": 0.6196639537811279, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 5682 + }, + { + "epoch": 4.539137380191693, + "grad_norm": 0.21590936183929443, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5683 + }, + { + "epoch": 4.539936102236422, + "grad_norm": 0.16313950717449188, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5684 + }, + { + "epoch": 4.5407348242811505, + "grad_norm": 0.12859022617340088, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5685 + }, + { + "epoch": 4.541533546325878, + "grad_norm": 0.1189458817243576, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 5686 + }, + { + "epoch": 4.542332268370607, + "grad_norm": 6.769774913787842, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5687 + }, + { + "epoch": 4.543130990415335, + "grad_norm": 0.20253166556358337, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5688 + }, + { + "epoch": 4.543929712460064, + "grad_norm": 0.11631135642528534, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 5689 + }, + { + "epoch": 4.544728434504792, + "grad_norm": 0.1848360300064087, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5690 + }, + { + "epoch": 4.545527156549521, + "grad_norm": 0.17804184556007385, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 5691 + }, + { + "epoch": 4.546325878594249, + "grad_norm": 0.2214183509349823, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5692 + }, + { + "epoch": 4.547124600638978, + "grad_norm": 16.448396682739258, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5693 + }, + { + "epoch": 4.547923322683706, + "grad_norm": 0.4933917224407196, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 5694 + }, + { + "epoch": 4.548722044728435, + "grad_norm": 0.41254448890686035, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 5695 + }, + { + "epoch": 4.549520766773163, + "grad_norm": 0.28898510336875916, + "learning_rate": 0.0005, + "loss": 1.0785, + "step": 5696 + }, + { + "epoch": 4.550319488817891, + "grad_norm": 0.2938457727432251, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 5697 + }, + { + "epoch": 4.55111821086262, + "grad_norm": 0.2264672964811325, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5698 + }, + { + "epoch": 4.551916932907348, + "grad_norm": 0.12931588292121887, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5699 + }, + { + "epoch": 4.552715654952077, + "grad_norm": 0.22106601297855377, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 5700 + }, + { + "epoch": 4.553514376996805, + "grad_norm": 0.31875962018966675, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5701 + }, + { + "epoch": 4.554313099041534, + "grad_norm": 0.3129211962223053, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5702 + }, + { + "epoch": 4.555111821086262, + "grad_norm": 0.1613578200340271, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5703 + }, + { + "epoch": 4.555910543130991, + "grad_norm": 0.6340786814689636, + "learning_rate": 0.0005, + "loss": 1.0737, + "step": 5704 + }, + { + "epoch": 4.556709265175719, + "grad_norm": 0.13203595578670502, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5705 + }, + { + "epoch": 4.557507987220447, + "grad_norm": 0.16561077535152435, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5706 + }, + { + "epoch": 4.5583067092651754, + "grad_norm": 0.17777414619922638, + "learning_rate": 0.0005, + "loss": 1.073, + "step": 5707 + }, + { + "epoch": 4.559105431309904, + "grad_norm": 0.6985258460044861, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 5708 + }, + { + "epoch": 4.5599041533546325, + "grad_norm": 0.18673790991306305, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 5709 + }, + { + "epoch": 4.560702875399361, + "grad_norm": 0.10636870563030243, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5710 + }, + { + "epoch": 4.56150159744409, + "grad_norm": 0.1719052493572235, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5711 + }, + { + "epoch": 4.562300319488818, + "grad_norm": 0.7030455470085144, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5712 + }, + { + "epoch": 4.563099041533547, + "grad_norm": 0.1482628434896469, + "learning_rate": 0.0005, + "loss": 1.0776, + "step": 5713 + }, + { + "epoch": 4.563897763578275, + "grad_norm": 0.1585852950811386, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5714 + }, + { + "epoch": 4.564696485623003, + "grad_norm": 0.16067056357860565, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5715 + }, + { + "epoch": 4.565495207667731, + "grad_norm": 0.16162389516830444, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5716 + }, + { + "epoch": 4.56629392971246, + "grad_norm": 0.07224202156066895, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5717 + }, + { + "epoch": 4.567092651757188, + "grad_norm": 0.2577751576900482, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5718 + }, + { + "epoch": 4.567891373801917, + "grad_norm": 1.676942229270935, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 5719 + }, + { + "epoch": 4.568690095846645, + "grad_norm": 0.11058419197797775, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5720 + }, + { + "epoch": 4.569488817891374, + "grad_norm": 0.23155376315116882, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 5721 + }, + { + "epoch": 4.5702875399361025, + "grad_norm": 0.1197747215628624, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 5722 + }, + { + "epoch": 4.571086261980831, + "grad_norm": 0.5179840326309204, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 5723 + }, + { + "epoch": 4.571884984025559, + "grad_norm": 0.17717961966991425, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 5724 + }, + { + "epoch": 4.572683706070287, + "grad_norm": 0.1513422429561615, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5725 + }, + { + "epoch": 4.573482428115016, + "grad_norm": 0.15495018661022186, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5726 + }, + { + "epoch": 4.574281150159744, + "grad_norm": 3.4248743057250977, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5727 + }, + { + "epoch": 4.575079872204473, + "grad_norm": 0.29529228806495667, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5728 + }, + { + "epoch": 4.575878594249201, + "grad_norm": 0.21125876903533936, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 5729 + }, + { + "epoch": 4.57667731629393, + "grad_norm": 0.16381484270095825, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5730 + }, + { + "epoch": 4.577476038338658, + "grad_norm": 0.2144167572259903, + "learning_rate": 0.0005, + "loss": 1.0824, + "step": 5731 + }, + { + "epoch": 4.578274760383387, + "grad_norm": 0.1564428210258484, + "learning_rate": 0.0005, + "loss": 1.0746, + "step": 5732 + }, + { + "epoch": 4.5790734824281145, + "grad_norm": 0.21137529611587524, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5733 + }, + { + "epoch": 4.579872204472844, + "grad_norm": 0.13836248219013214, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5734 + }, + { + "epoch": 4.580670926517572, + "grad_norm": 0.11749537289142609, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 5735 + }, + { + "epoch": 4.5814696485623, + "grad_norm": 0.10901704430580139, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5736 + }, + { + "epoch": 4.582268370607029, + "grad_norm": 0.08402425795793533, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5737 + }, + { + "epoch": 4.583067092651757, + "grad_norm": 0.1502164900302887, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5738 + }, + { + "epoch": 4.583865814696486, + "grad_norm": 0.10606876760721207, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5739 + }, + { + "epoch": 4.584664536741214, + "grad_norm": 0.11868279427289963, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5740 + }, + { + "epoch": 4.585463258785943, + "grad_norm": 0.10678767412900925, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5741 + }, + { + "epoch": 4.586261980830671, + "grad_norm": 0.28886285424232483, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5742 + }, + { + "epoch": 4.5870607028754, + "grad_norm": 0.3516097366809845, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5743 + }, + { + "epoch": 4.587859424920127, + "grad_norm": 0.10221854597330093, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 5744 + }, + { + "epoch": 4.588658146964856, + "grad_norm": 0.24786177277565002, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5745 + }, + { + "epoch": 4.5894568690095845, + "grad_norm": 0.10537181794643402, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5746 + }, + { + "epoch": 4.590255591054313, + "grad_norm": 0.23574885725975037, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 5747 + }, + { + "epoch": 4.5910543130990416, + "grad_norm": 0.1483563631772995, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 5748 + }, + { + "epoch": 4.59185303514377, + "grad_norm": 0.1516815721988678, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 5749 + }, + { + "epoch": 4.592651757188499, + "grad_norm": 0.09670868515968323, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5750 + }, + { + "epoch": 4.593450479233227, + "grad_norm": 0.10706239938735962, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 5751 + }, + { + "epoch": 4.594249201277956, + "grad_norm": 1.081868290901184, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5752 + }, + { + "epoch": 4.595047923322683, + "grad_norm": 0.4016919732093811, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5753 + }, + { + "epoch": 4.595846645367412, + "grad_norm": 0.3266371786594391, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5754 + }, + { + "epoch": 4.59664536741214, + "grad_norm": 0.23380769789218903, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5755 + }, + { + "epoch": 4.597444089456869, + "grad_norm": 0.2521349787712097, + "learning_rate": 0.0005, + "loss": 1.0841, + "step": 5756 + }, + { + "epoch": 4.598242811501597, + "grad_norm": 0.2223331481218338, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 5757 + }, + { + "epoch": 4.599041533546326, + "grad_norm": 0.177442729473114, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 5758 + }, + { + "epoch": 4.5998402555910545, + "grad_norm": 0.18474844098091125, + "learning_rate": 0.0005, + "loss": 1.0752, + "step": 5759 + }, + { + "epoch": 4.600638977635783, + "grad_norm": 0.1686495542526245, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 5760 + }, + { + "epoch": 4.6014376996805115, + "grad_norm": 0.13674414157867432, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 5761 + }, + { + "epoch": 4.602236421725239, + "grad_norm": 0.1390203833580017, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 5762 + }, + { + "epoch": 4.603035143769968, + "grad_norm": 0.10701096057891846, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5763 + }, + { + "epoch": 4.603833865814696, + "grad_norm": 0.110149085521698, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5764 + }, + { + "epoch": 4.604632587859425, + "grad_norm": 0.2477579116821289, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 5765 + }, + { + "epoch": 4.605431309904153, + "grad_norm": 0.2554718852043152, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 5766 + }, + { + "epoch": 4.606230031948882, + "grad_norm": 0.1945963203907013, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5767 + }, + { + "epoch": 4.60702875399361, + "grad_norm": 0.26785531640052795, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 5768 + }, + { + "epoch": 4.607827476038339, + "grad_norm": 0.3007332980632782, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5769 + }, + { + "epoch": 4.608626198083067, + "grad_norm": 0.09973788261413574, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5770 + }, + { + "epoch": 4.609424920127795, + "grad_norm": 0.09176181256771088, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5771 + }, + { + "epoch": 4.6102236421725244, + "grad_norm": 0.1395607590675354, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 5772 + }, + { + "epoch": 4.611022364217252, + "grad_norm": 0.8938566446304321, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5773 + }, + { + "epoch": 4.611821086261981, + "grad_norm": 0.3093889653682709, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5774 + }, + { + "epoch": 4.612619808306709, + "grad_norm": 0.1910911351442337, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 5775 + }, + { + "epoch": 4.613418530351438, + "grad_norm": 0.11586496978998184, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 5776 + }, + { + "epoch": 4.614217252396166, + "grad_norm": 0.222470223903656, + "learning_rate": 0.0005, + "loss": 1.0786, + "step": 5777 + }, + { + "epoch": 4.615015974440895, + "grad_norm": 0.16580955684185028, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 5778 + }, + { + "epoch": 4.615814696485623, + "grad_norm": 0.11279458552598953, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5779 + }, + { + "epoch": 4.616613418530352, + "grad_norm": 0.10970400273799896, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5780 + }, + { + "epoch": 4.61741214057508, + "grad_norm": 0.11291752755641937, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5781 + }, + { + "epoch": 4.618210862619808, + "grad_norm": 0.19262762367725372, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5782 + }, + { + "epoch": 4.6190095846645365, + "grad_norm": 0.12736102938652039, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 5783 + }, + { + "epoch": 4.619808306709265, + "grad_norm": 0.09300720691680908, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5784 + }, + { + "epoch": 4.6206070287539935, + "grad_norm": 0.09544654190540314, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5785 + }, + { + "epoch": 4.621405750798722, + "grad_norm": 0.2888239026069641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5786 + }, + { + "epoch": 4.622204472843451, + "grad_norm": 0.22988484799861908, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5787 + }, + { + "epoch": 4.623003194888179, + "grad_norm": 0.2574143707752228, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 5788 + }, + { + "epoch": 4.623801916932908, + "grad_norm": 0.2503221333026886, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5789 + }, + { + "epoch": 4.624600638977636, + "grad_norm": 0.20846052467823029, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5790 + }, + { + "epoch": 4.625399361022364, + "grad_norm": 0.218403160572052, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5791 + }, + { + "epoch": 4.626198083067092, + "grad_norm": 0.11333920061588287, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5792 + }, + { + "epoch": 4.626996805111821, + "grad_norm": 0.19022895395755768, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5793 + }, + { + "epoch": 4.627795527156549, + "grad_norm": 0.1525644063949585, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 5794 + }, + { + "epoch": 4.628594249201278, + "grad_norm": 0.07636452466249466, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5795 + }, + { + "epoch": 4.6293929712460065, + "grad_norm": 0.1358552873134613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5796 + }, + { + "epoch": 4.630191693290735, + "grad_norm": 0.08993138372898102, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5797 + }, + { + "epoch": 4.6309904153354635, + "grad_norm": 0.15454545617103577, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5798 + }, + { + "epoch": 4.631789137380192, + "grad_norm": 0.12256992608308792, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5799 + }, + { + "epoch": 4.63258785942492, + "grad_norm": 0.08453187346458435, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5800 + }, + { + "epoch": 4.633386581469648, + "grad_norm": 0.1474936157464981, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5801 + }, + { + "epoch": 4.634185303514377, + "grad_norm": 0.11481066793203354, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 5802 + }, + { + "epoch": 4.634984025559105, + "grad_norm": 0.41141587495803833, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5803 + }, + { + "epoch": 4.635782747603834, + "grad_norm": 0.1509549766778946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 5804 + }, + { + "epoch": 4.636581469648562, + "grad_norm": 0.13562771677970886, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5805 + }, + { + "epoch": 4.637380191693291, + "grad_norm": 0.09722459316253662, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5806 + }, + { + "epoch": 4.638178913738019, + "grad_norm": 0.3194493353366852, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 5807 + }, + { + "epoch": 4.638977635782748, + "grad_norm": 0.23091651499271393, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5808 + }, + { + "epoch": 4.6397763578274756, + "grad_norm": 0.1682155877351761, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5809 + }, + { + "epoch": 4.640575079872205, + "grad_norm": 0.37293288111686707, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 5810 + }, + { + "epoch": 4.641373801916933, + "grad_norm": 0.3746488094329834, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5811 + }, + { + "epoch": 4.642172523961661, + "grad_norm": 0.2068052738904953, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5812 + }, + { + "epoch": 4.64297124600639, + "grad_norm": 0.13229581713676453, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5813 + }, + { + "epoch": 4.643769968051118, + "grad_norm": 0.24158459901809692, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5814 + }, + { + "epoch": 4.644568690095847, + "grad_norm": 0.4241867959499359, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5815 + }, + { + "epoch": 4.645367412140575, + "grad_norm": 0.40008923411369324, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5816 + }, + { + "epoch": 4.646166134185304, + "grad_norm": 0.3150584101676941, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 5817 + }, + { + "epoch": 4.646964856230032, + "grad_norm": 0.11021434515714645, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 5818 + }, + { + "epoch": 4.647763578274761, + "grad_norm": 0.30061402916908264, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5819 + }, + { + "epoch": 4.6485623003194885, + "grad_norm": 0.12583592534065247, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5820 + }, + { + "epoch": 4.649361022364217, + "grad_norm": 0.31917983293533325, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5821 + }, + { + "epoch": 4.6501597444089455, + "grad_norm": 0.2097153663635254, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5822 + }, + { + "epoch": 4.650958466453674, + "grad_norm": 0.19847621023654938, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5823 + }, + { + "epoch": 4.651757188498403, + "grad_norm": 0.2482050508260727, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5824 + }, + { + "epoch": 4.652555910543131, + "grad_norm": 0.1257491409778595, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 5825 + }, + { + "epoch": 4.65335463258786, + "grad_norm": 0.2192201465368271, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5826 + }, + { + "epoch": 4.654153354632588, + "grad_norm": 0.16453656554222107, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 5827 + }, + { + "epoch": 4.654952076677317, + "grad_norm": 0.18813923001289368, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 5828 + }, + { + "epoch": 4.655750798722044, + "grad_norm": 0.1811141073703766, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5829 + }, + { + "epoch": 4.656549520766773, + "grad_norm": 0.08911352604627609, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5830 + }, + { + "epoch": 4.657348242811501, + "grad_norm": 0.17858019471168518, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5831 + }, + { + "epoch": 4.65814696485623, + "grad_norm": 0.27315759658813477, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5832 + }, + { + "epoch": 4.6589456869009584, + "grad_norm": 0.18612337112426758, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5833 + }, + { + "epoch": 4.659744408945687, + "grad_norm": 0.2646125257015228, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 5834 + }, + { + "epoch": 4.6605431309904155, + "grad_norm": 0.07320903241634369, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5835 + }, + { + "epoch": 4.661341853035144, + "grad_norm": 0.12969297170639038, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 5836 + }, + { + "epoch": 4.662140575079873, + "grad_norm": 0.37665078043937683, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5837 + }, + { + "epoch": 4.6629392971246, + "grad_norm": 0.11055029928684235, + "learning_rate": 0.0005, + "loss": 1.0735, + "step": 5838 + }, + { + "epoch": 4.663738019169329, + "grad_norm": 0.12279482185840607, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5839 + }, + { + "epoch": 4.664536741214057, + "grad_norm": 0.0686316192150116, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5840 + }, + { + "epoch": 4.665335463258786, + "grad_norm": 0.09705425798892975, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 5841 + }, + { + "epoch": 4.666134185303514, + "grad_norm": 0.09543570131063461, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 5842 + }, + { + "epoch": 4.666932907348243, + "grad_norm": 0.08460460603237152, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 5843 + }, + { + "epoch": 4.667731629392971, + "grad_norm": 0.12419378757476807, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5844 + }, + { + "epoch": 4.6685303514377, + "grad_norm": 0.09184019267559052, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 5845 + }, + { + "epoch": 4.669329073482428, + "grad_norm": 0.09425100684165955, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 5846 + }, + { + "epoch": 4.670127795527156, + "grad_norm": 0.19701971113681793, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5847 + }, + { + "epoch": 4.6709265175718855, + "grad_norm": 0.0648239254951477, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5848 + }, + { + "epoch": 4.671725239616613, + "grad_norm": 0.11558888107538223, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 5849 + }, + { + "epoch": 4.672523961661342, + "grad_norm": 0.12397976219654083, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 5850 + }, + { + "epoch": 4.67332268370607, + "grad_norm": 0.10640132427215576, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5851 + }, + { + "epoch": 4.674121405750799, + "grad_norm": 0.08930578827857971, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 5852 + }, + { + "epoch": 4.674920127795527, + "grad_norm": 0.06212310120463371, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5853 + }, + { + "epoch": 4.675718849840256, + "grad_norm": 0.08568188548088074, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 5854 + }, + { + "epoch": 4.676517571884984, + "grad_norm": 0.11431021988391876, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 5855 + }, + { + "epoch": 4.677316293929713, + "grad_norm": 0.34381258487701416, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 5856 + }, + { + "epoch": 4.678115015974441, + "grad_norm": 0.1996181309223175, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5857 + }, + { + "epoch": 4.678913738019169, + "grad_norm": 0.2900290787220001, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 5858 + }, + { + "epoch": 4.6797124600638975, + "grad_norm": 0.35768410563468933, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5859 + }, + { + "epoch": 4.680511182108626, + "grad_norm": 0.1027536615729332, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5860 + }, + { + "epoch": 4.681309904153355, + "grad_norm": 0.6286419630050659, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 5861 + }, + { + "epoch": 4.682108626198083, + "grad_norm": 0.5037242770195007, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 5862 + }, + { + "epoch": 4.682907348242812, + "grad_norm": 0.34654417634010315, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 5863 + }, + { + "epoch": 4.68370607028754, + "grad_norm": 0.18139366805553436, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5864 + }, + { + "epoch": 4.684504792332269, + "grad_norm": 0.2101605087518692, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 5865 + }, + { + "epoch": 4.685303514376997, + "grad_norm": 0.0922360047698021, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 5866 + }, + { + "epoch": 4.686102236421725, + "grad_norm": 0.23476624488830566, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5867 + }, + { + "epoch": 4.686900958466453, + "grad_norm": 0.1843792051076889, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 5868 + }, + { + "epoch": 4.687699680511182, + "grad_norm": 0.09449298679828644, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 5869 + }, + { + "epoch": 4.68849840255591, + "grad_norm": 0.13996686041355133, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 5870 + }, + { + "epoch": 4.689297124600639, + "grad_norm": 2.113325357437134, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 5871 + }, + { + "epoch": 4.6900958466453675, + "grad_norm": 0.35181209444999695, + "learning_rate": 0.0005, + "loss": 1.0846, + "step": 5872 + }, + { + "epoch": 4.690894568690096, + "grad_norm": 0.3530768156051636, + "learning_rate": 0.0005, + "loss": 1.0753, + "step": 5873 + }, + { + "epoch": 4.6916932907348246, + "grad_norm": 0.25919783115386963, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 5874 + }, + { + "epoch": 4.692492012779553, + "grad_norm": 0.19770720601081848, + "learning_rate": 0.0005, + "loss": 1.0751, + "step": 5875 + }, + { + "epoch": 4.693290734824281, + "grad_norm": 0.32085585594177246, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 5876 + }, + { + "epoch": 4.694089456869009, + "grad_norm": 0.14215363562107086, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 5877 + }, + { + "epoch": 4.694888178913738, + "grad_norm": 0.24502497911453247, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 5878 + }, + { + "epoch": 4.695686900958466, + "grad_norm": 0.15765784680843353, + "learning_rate": 0.0005, + "loss": 1.0721, + "step": 5879 + }, + { + "epoch": 4.696485623003195, + "grad_norm": 0.13945002853870392, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 5880 + }, + { + "epoch": 4.697284345047923, + "grad_norm": 0.16315795481204987, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 5881 + }, + { + "epoch": 4.698083067092652, + "grad_norm": 0.0803297907114029, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5882 + }, + { + "epoch": 4.69888178913738, + "grad_norm": 0.09848042577505112, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 5883 + }, + { + "epoch": 4.699680511182109, + "grad_norm": 0.22370465099811554, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 5884 + }, + { + "epoch": 4.700479233226837, + "grad_norm": 0.09369395673274994, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5885 + }, + { + "epoch": 4.701277955271565, + "grad_norm": 0.42340102791786194, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 5886 + }, + { + "epoch": 4.702076677316294, + "grad_norm": 0.08471440523862839, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 5887 + }, + { + "epoch": 4.702875399361022, + "grad_norm": 0.11350758373737335, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 5888 + }, + { + "epoch": 4.703674121405751, + "grad_norm": 0.16862216591835022, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 5889 + }, + { + "epoch": 4.704472843450479, + "grad_norm": 0.17468953132629395, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5890 + }, + { + "epoch": 4.705271565495208, + "grad_norm": 0.09154370427131653, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 5891 + }, + { + "epoch": 4.706070287539936, + "grad_norm": 0.08715084940195084, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5892 + }, + { + "epoch": 4.706869009584665, + "grad_norm": 0.06797291338443756, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 5893 + }, + { + "epoch": 4.707667731629393, + "grad_norm": 0.17333610355854034, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5894 + }, + { + "epoch": 4.708466453674122, + "grad_norm": 0.17272767424583435, + "learning_rate": 0.0005, + "loss": 1.0727, + "step": 5895 + }, + { + "epoch": 4.7092651757188495, + "grad_norm": 0.11773357540369034, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5896 + }, + { + "epoch": 4.710063897763578, + "grad_norm": 0.08420758694410324, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 5897 + }, + { + "epoch": 4.710862619808307, + "grad_norm": 0.08672801405191422, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 5898 + }, + { + "epoch": 4.711661341853035, + "grad_norm": 0.2356635183095932, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 5899 + }, + { + "epoch": 4.712460063897764, + "grad_norm": 0.06091082841157913, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 5900 + }, + { + "epoch": 4.713258785942492, + "grad_norm": 0.09156842529773712, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 5901 + }, + { + "epoch": 4.714057507987221, + "grad_norm": 0.06548108160495758, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5902 + }, + { + "epoch": 4.714856230031949, + "grad_norm": 0.12813016772270203, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 5903 + }, + { + "epoch": 4.715654952076678, + "grad_norm": 0.1518833339214325, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 5904 + }, + { + "epoch": 4.716453674121405, + "grad_norm": 0.09331580996513367, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 5905 + }, + { + "epoch": 4.717252396166134, + "grad_norm": 0.11989843845367432, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5906 + }, + { + "epoch": 4.718051118210862, + "grad_norm": 0.1277054399251938, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5907 + }, + { + "epoch": 4.718849840255591, + "grad_norm": 0.11199159920215607, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5908 + }, + { + "epoch": 4.7196485623003195, + "grad_norm": 0.09120891988277435, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 5909 + }, + { + "epoch": 4.720447284345048, + "grad_norm": 0.11668230593204498, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 5910 + }, + { + "epoch": 4.7212460063897765, + "grad_norm": 0.08594206720590591, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 5911 + }, + { + "epoch": 4.722044728434505, + "grad_norm": 0.11563027650117874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5912 + }, + { + "epoch": 4.722843450479234, + "grad_norm": 0.15066663920879364, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 5913 + }, + { + "epoch": 4.723642172523961, + "grad_norm": 0.08566875755786896, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5914 + }, + { + "epoch": 4.72444089456869, + "grad_norm": 0.060813747346401215, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 5915 + }, + { + "epoch": 4.725239616613418, + "grad_norm": 0.07391642779111862, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 5916 + }, + { + "epoch": 4.726038338658147, + "grad_norm": 0.04867766425013542, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5917 + }, + { + "epoch": 4.726837060702875, + "grad_norm": 0.09468305110931396, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5918 + }, + { + "epoch": 4.727635782747604, + "grad_norm": 0.07287945598363876, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 5919 + }, + { + "epoch": 4.728434504792332, + "grad_norm": 0.08984806388616562, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5920 + }, + { + "epoch": 4.729233226837061, + "grad_norm": 0.1755092740058899, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 5921 + }, + { + "epoch": 4.7300319488817895, + "grad_norm": 0.09656399488449097, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 5922 + }, + { + "epoch": 4.730830670926517, + "grad_norm": 0.15759015083312988, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 5923 + }, + { + "epoch": 4.731629392971246, + "grad_norm": 0.13238383829593658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5924 + }, + { + "epoch": 4.732428115015974, + "grad_norm": 0.05352601036429405, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5925 + }, + { + "epoch": 4.733226837060703, + "grad_norm": 0.06253937631845474, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 5926 + }, + { + "epoch": 4.734025559105431, + "grad_norm": 0.057317376136779785, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 5927 + }, + { + "epoch": 4.73482428115016, + "grad_norm": 0.12154382467269897, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5928 + }, + { + "epoch": 4.735623003194888, + "grad_norm": 0.0547759085893631, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 5929 + }, + { + "epoch": 4.736421725239617, + "grad_norm": 0.07446085661649704, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 5930 + }, + { + "epoch": 4.737220447284345, + "grad_norm": 0.09809007495641708, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 5931 + }, + { + "epoch": 4.738019169329074, + "grad_norm": 0.12434732168912888, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 5932 + }, + { + "epoch": 4.738817891373802, + "grad_norm": 0.12192053347826004, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5933 + }, + { + "epoch": 4.73961661341853, + "grad_norm": 0.08006733655929565, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5934 + }, + { + "epoch": 4.7404153354632586, + "grad_norm": 0.14677436649799347, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 5935 + }, + { + "epoch": 4.741214057507987, + "grad_norm": 0.10133987665176392, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 5936 + }, + { + "epoch": 4.742012779552716, + "grad_norm": 0.10331577062606812, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 5937 + }, + { + "epoch": 4.742811501597444, + "grad_norm": 0.14596082270145416, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 5938 + }, + { + "epoch": 4.743610223642173, + "grad_norm": 0.15139590203762054, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 5939 + }, + { + "epoch": 4.744408945686901, + "grad_norm": 0.0935182124376297, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 5940 + }, + { + "epoch": 4.74520766773163, + "grad_norm": 0.1002865880727768, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5941 + }, + { + "epoch": 4.746006389776358, + "grad_norm": 0.0968283861875534, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 5942 + }, + { + "epoch": 4.746805111821086, + "grad_norm": 0.11680585891008377, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5943 + }, + { + "epoch": 4.747603833865814, + "grad_norm": 0.12163184583187103, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 5944 + }, + { + "epoch": 4.748402555910543, + "grad_norm": 0.07288502901792526, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5945 + }, + { + "epoch": 4.7492012779552715, + "grad_norm": 0.3335740566253662, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 5946 + }, + { + "epoch": 4.75, + "grad_norm": 0.15408654510974884, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 5947 + }, + { + "epoch": 4.7507987220447285, + "grad_norm": 0.09612353891134262, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5948 + }, + { + "epoch": 4.751597444089457, + "grad_norm": 0.10403789579868317, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5949 + }, + { + "epoch": 4.752396166134186, + "grad_norm": 0.13026492297649384, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 5950 + }, + { + "epoch": 4.753194888178914, + "grad_norm": 0.061955004930496216, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 5951 + }, + { + "epoch": 4.753993610223642, + "grad_norm": 0.08264514058828354, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5952 + }, + { + "epoch": 4.75479233226837, + "grad_norm": 0.1132993996143341, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5953 + }, + { + "epoch": 4.755591054313099, + "grad_norm": 0.09022228419780731, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 5954 + }, + { + "epoch": 4.756389776357827, + "grad_norm": 0.13192631304264069, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5955 + }, + { + "epoch": 4.757188498402556, + "grad_norm": 0.08400337398052216, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5956 + }, + { + "epoch": 4.757987220447284, + "grad_norm": 0.05070018023252487, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 5957 + }, + { + "epoch": 4.758785942492013, + "grad_norm": 0.09561482816934586, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 5958 + }, + { + "epoch": 4.7595846645367414, + "grad_norm": 0.07369764894247055, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5959 + }, + { + "epoch": 4.76038338658147, + "grad_norm": 0.07777421176433563, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5960 + }, + { + "epoch": 4.761182108626198, + "grad_norm": 0.11525892466306686, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 5961 + }, + { + "epoch": 4.761980830670926, + "grad_norm": 0.1788506656885147, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 5962 + }, + { + "epoch": 4.762779552715655, + "grad_norm": 0.10067635029554367, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 5963 + }, + { + "epoch": 4.763578274760383, + "grad_norm": 0.08447863161563873, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 5964 + }, + { + "epoch": 4.764376996805112, + "grad_norm": 0.06801758706569672, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 5965 + }, + { + "epoch": 4.76517571884984, + "grad_norm": 0.07363327592611313, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5966 + }, + { + "epoch": 4.765974440894569, + "grad_norm": 0.05584784597158432, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 5967 + }, + { + "epoch": 4.766773162939297, + "grad_norm": 0.10064459592103958, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 5968 + }, + { + "epoch": 4.767571884984026, + "grad_norm": 0.1176871508359909, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 5969 + }, + { + "epoch": 4.768370607028754, + "grad_norm": 0.17485690116882324, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 5970 + }, + { + "epoch": 4.769169329073483, + "grad_norm": 0.15753531455993652, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 5971 + }, + { + "epoch": 4.7699680511182105, + "grad_norm": 0.1669864058494568, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5972 + }, + { + "epoch": 4.770766773162939, + "grad_norm": 0.07706131786108017, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 5973 + }, + { + "epoch": 4.771565495207668, + "grad_norm": 0.3537883460521698, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 5974 + }, + { + "epoch": 4.772364217252396, + "grad_norm": 0.20092372596263885, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 5975 + }, + { + "epoch": 4.773162939297125, + "grad_norm": 0.06521142274141312, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 5976 + }, + { + "epoch": 4.773961661341853, + "grad_norm": 0.1203140988945961, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 5977 + }, + { + "epoch": 4.774760383386582, + "grad_norm": 0.09655500948429108, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 5978 + }, + { + "epoch": 4.77555910543131, + "grad_norm": 0.09220302104949951, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5979 + }, + { + "epoch": 4.776357827476039, + "grad_norm": 0.7336251735687256, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 5980 + }, + { + "epoch": 4.777156549520766, + "grad_norm": 0.21415477991104126, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 5981 + }, + { + "epoch": 4.777955271565495, + "grad_norm": 0.14869220554828644, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 5982 + }, + { + "epoch": 4.7787539936102235, + "grad_norm": 0.0779772400856018, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 5983 + }, + { + "epoch": 4.779552715654952, + "grad_norm": 0.14274317026138306, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 5984 + }, + { + "epoch": 4.7803514376996805, + "grad_norm": 0.11580413579940796, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 5985 + }, + { + "epoch": 4.781150159744409, + "grad_norm": 0.055023401975631714, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 5986 + }, + { + "epoch": 4.781948881789138, + "grad_norm": 0.11657343804836273, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 5987 + }, + { + "epoch": 4.782747603833866, + "grad_norm": 0.07336080819368362, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 5988 + }, + { + "epoch": 4.783546325878595, + "grad_norm": 0.06066504120826721, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 5989 + }, + { + "epoch": 4.784345047923322, + "grad_norm": 0.05784285068511963, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 5990 + }, + { + "epoch": 4.785143769968051, + "grad_norm": 0.06317969411611557, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 5991 + }, + { + "epoch": 4.785942492012779, + "grad_norm": 0.1001245379447937, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 5992 + }, + { + "epoch": 4.786741214057508, + "grad_norm": 0.0743420347571373, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 5993 + }, + { + "epoch": 4.787539936102236, + "grad_norm": 0.07082799077033997, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 5994 + }, + { + "epoch": 4.788338658146965, + "grad_norm": 0.11087984591722488, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 5995 + }, + { + "epoch": 4.789137380191693, + "grad_norm": 0.05923386290669441, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 5996 + }, + { + "epoch": 4.789936102236422, + "grad_norm": 0.1020246297121048, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 5997 + }, + { + "epoch": 4.7907348242811505, + "grad_norm": 0.11524185538291931, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 5998 + }, + { + "epoch": 4.791533546325878, + "grad_norm": 0.06959006190299988, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 5999 + }, + { + "epoch": 4.792332268370607, + "grad_norm": 0.19179846346378326, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6000 + }, + { + "epoch": 4.793130990415335, + "grad_norm": 0.17232562601566315, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6001 + }, + { + "epoch": 4.793929712460064, + "grad_norm": 0.7047739028930664, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6002 + }, + { + "epoch": 4.794728434504792, + "grad_norm": 0.09086379408836365, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6003 + }, + { + "epoch": 4.795527156549521, + "grad_norm": 0.17785955965518951, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6004 + }, + { + "epoch": 4.796325878594249, + "grad_norm": 0.09529274702072144, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6005 + }, + { + "epoch": 4.797124600638978, + "grad_norm": 0.08041567355394363, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6006 + }, + { + "epoch": 4.797923322683706, + "grad_norm": 0.13888375461101532, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6007 + }, + { + "epoch": 4.798722044728435, + "grad_norm": 0.08110564947128296, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6008 + }, + { + "epoch": 4.799520766773163, + "grad_norm": 0.07443006336688995, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6009 + }, + { + "epoch": 4.800319488817891, + "grad_norm": 0.08499104529619217, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6010 + }, + { + "epoch": 4.80111821086262, + "grad_norm": 0.0616084523499012, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6011 + }, + { + "epoch": 4.801916932907348, + "grad_norm": 0.10845918208360672, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6012 + }, + { + "epoch": 4.802715654952077, + "grad_norm": 0.057658810168504715, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6013 + }, + { + "epoch": 4.803514376996805, + "grad_norm": 0.07163018733263016, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6014 + }, + { + "epoch": 4.804313099041534, + "grad_norm": 0.07016896456480026, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6015 + }, + { + "epoch": 4.805111821086262, + "grad_norm": 0.08233597129583359, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6016 + }, + { + "epoch": 4.805910543130991, + "grad_norm": 0.05408332124352455, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6017 + }, + { + "epoch": 4.806709265175719, + "grad_norm": 0.0886560007929802, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6018 + }, + { + "epoch": 4.807507987220447, + "grad_norm": 0.17860093712806702, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6019 + }, + { + "epoch": 4.8083067092651754, + "grad_norm": 0.26264694333076477, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6020 + }, + { + "epoch": 4.809105431309904, + "grad_norm": 0.08523311465978622, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6021 + }, + { + "epoch": 4.8099041533546325, + "grad_norm": 0.09873831272125244, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6022 + }, + { + "epoch": 4.810702875399361, + "grad_norm": 0.16135412454605103, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6023 + }, + { + "epoch": 4.81150159744409, + "grad_norm": 0.08003875613212585, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6024 + }, + { + "epoch": 4.812300319488818, + "grad_norm": 0.09117014706134796, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6025 + }, + { + "epoch": 4.813099041533547, + "grad_norm": 0.2316243052482605, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6026 + }, + { + "epoch": 4.813897763578275, + "grad_norm": 0.16050362586975098, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6027 + }, + { + "epoch": 4.814696485623003, + "grad_norm": 0.13559919595718384, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6028 + }, + { + "epoch": 4.815495207667731, + "grad_norm": 0.08917123824357986, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6029 + }, + { + "epoch": 4.81629392971246, + "grad_norm": 0.11498702317476273, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6030 + }, + { + "epoch": 4.817092651757188, + "grad_norm": 0.14677700400352478, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6031 + }, + { + "epoch": 4.817891373801917, + "grad_norm": 0.08849102258682251, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6032 + }, + { + "epoch": 4.818690095846645, + "grad_norm": 4.0974507331848145, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6033 + }, + { + "epoch": 4.819488817891374, + "grad_norm": 0.24215161800384521, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6034 + }, + { + "epoch": 4.8202875399361025, + "grad_norm": 0.2679882049560547, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6035 + }, + { + "epoch": 4.821086261980831, + "grad_norm": 0.11113203316926956, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6036 + }, + { + "epoch": 4.821884984025559, + "grad_norm": 0.17725592851638794, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6037 + }, + { + "epoch": 4.822683706070287, + "grad_norm": 0.08446694165468216, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6038 + }, + { + "epoch": 4.823482428115016, + "grad_norm": 0.26757946610450745, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6039 + }, + { + "epoch": 4.824281150159744, + "grad_norm": 0.1900561898946762, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6040 + }, + { + "epoch": 4.825079872204473, + "grad_norm": 0.21993426978588104, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6041 + }, + { + "epoch": 4.825878594249201, + "grad_norm": 15.862943649291992, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6042 + }, + { + "epoch": 4.82667731629393, + "grad_norm": 0.793515145778656, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6043 + }, + { + "epoch": 4.827476038338658, + "grad_norm": 0.5607691407203674, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6044 + }, + { + "epoch": 4.828274760383387, + "grad_norm": 0.2853091359138489, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6045 + }, + { + "epoch": 4.8290734824281145, + "grad_norm": 0.3579944670200348, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6046 + }, + { + "epoch": 4.829872204472844, + "grad_norm": 0.26784929633140564, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6047 + }, + { + "epoch": 4.830670926517572, + "grad_norm": 0.2363428920507431, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6048 + }, + { + "epoch": 4.8314696485623, + "grad_norm": 0.2922425866127014, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6049 + }, + { + "epoch": 4.832268370607029, + "grad_norm": 0.2173125147819519, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6050 + }, + { + "epoch": 4.833067092651757, + "grad_norm": 0.23552696406841278, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6051 + }, + { + "epoch": 4.833865814696486, + "grad_norm": 1.2383053302764893, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6052 + }, + { + "epoch": 4.834664536741214, + "grad_norm": 0.3284873366355896, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6053 + }, + { + "epoch": 4.835463258785943, + "grad_norm": 0.15584628283977509, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6054 + }, + { + "epoch": 4.836261980830671, + "grad_norm": 0.3136327862739563, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6055 + }, + { + "epoch": 4.8370607028754, + "grad_norm": 0.19863441586494446, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6056 + }, + { + "epoch": 4.837859424920127, + "grad_norm": 0.273644357919693, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6057 + }, + { + "epoch": 4.838658146964856, + "grad_norm": 0.2560950517654419, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6058 + }, + { + "epoch": 4.8394568690095845, + "grad_norm": 0.2243220955133438, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6059 + }, + { + "epoch": 4.840255591054313, + "grad_norm": 0.16328522562980652, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6060 + }, + { + "epoch": 4.8410543130990416, + "grad_norm": 0.42267754673957825, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6061 + }, + { + "epoch": 4.84185303514377, + "grad_norm": 0.21733495593070984, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6062 + }, + { + "epoch": 4.842651757188499, + "grad_norm": 0.12917862832546234, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6063 + }, + { + "epoch": 4.843450479233227, + "grad_norm": 0.1829921007156372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6064 + }, + { + "epoch": 4.844249201277956, + "grad_norm": 0.08751819282770157, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6065 + }, + { + "epoch": 4.845047923322683, + "grad_norm": 0.16521455347537994, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6066 + }, + { + "epoch": 4.845846645367412, + "grad_norm": 0.4328543543815613, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6067 + }, + { + "epoch": 4.84664536741214, + "grad_norm": 0.2682073712348938, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6068 + }, + { + "epoch": 4.847444089456869, + "grad_norm": 0.15217293798923492, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6069 + }, + { + "epoch": 4.848242811501597, + "grad_norm": 0.12807190418243408, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 6070 + }, + { + "epoch": 4.849041533546326, + "grad_norm": 1.4503207206726074, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6071 + }, + { + "epoch": 4.8498402555910545, + "grad_norm": 0.5045278668403625, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6072 + }, + { + "epoch": 4.850638977635783, + "grad_norm": 0.1992882788181305, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6073 + }, + { + "epoch": 4.8514376996805115, + "grad_norm": 0.3178166151046753, + "learning_rate": 0.0005, + "loss": 1.0708, + "step": 6074 + }, + { + "epoch": 4.852236421725239, + "grad_norm": 0.1244354322552681, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6075 + }, + { + "epoch": 4.853035143769968, + "grad_norm": 0.2837885320186615, + "learning_rate": 0.0005, + "loss": 1.0763, + "step": 6076 + }, + { + "epoch": 4.853833865814696, + "grad_norm": 0.11910229921340942, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6077 + }, + { + "epoch": 4.854632587859425, + "grad_norm": 0.5774815678596497, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6078 + }, + { + "epoch": 4.855431309904153, + "grad_norm": 0.13028140366077423, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6079 + }, + { + "epoch": 4.856230031948882, + "grad_norm": 0.21022816002368927, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6080 + }, + { + "epoch": 4.85702875399361, + "grad_norm": 0.11758062243461609, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6081 + }, + { + "epoch": 4.857827476038339, + "grad_norm": 0.1321621984243393, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6082 + }, + { + "epoch": 4.858626198083067, + "grad_norm": 0.11481605470180511, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6083 + }, + { + "epoch": 4.859424920127795, + "grad_norm": 0.0976998507976532, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6084 + }, + { + "epoch": 4.8602236421725244, + "grad_norm": 0.7211679220199585, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6085 + }, + { + "epoch": 4.861022364217252, + "grad_norm": 0.1417546272277832, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6086 + }, + { + "epoch": 4.861821086261981, + "grad_norm": 0.13830699026584625, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6087 + }, + { + "epoch": 4.862619808306709, + "grad_norm": 0.24840030074119568, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 6088 + }, + { + "epoch": 4.863418530351438, + "grad_norm": 3.442054033279419, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6089 + }, + { + "epoch": 4.864217252396166, + "grad_norm": 0.21404840052127838, + "learning_rate": 0.0005, + "loss": 1.0699, + "step": 6090 + }, + { + "epoch": 4.865015974440895, + "grad_norm": 0.3657711148262024, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6091 + }, + { + "epoch": 4.865814696485623, + "grad_norm": 0.2189537137746811, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6092 + }, + { + "epoch": 4.866613418530352, + "grad_norm": 0.17866109311580658, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6093 + }, + { + "epoch": 4.86741214057508, + "grad_norm": 0.19208978116512299, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6094 + }, + { + "epoch": 4.868210862619808, + "grad_norm": 0.08330709487199783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6095 + }, + { + "epoch": 4.8690095846645365, + "grad_norm": 0.1194678544998169, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6096 + }, + { + "epoch": 4.869808306709265, + "grad_norm": 0.07852908223867416, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6097 + }, + { + "epoch": 4.8706070287539935, + "grad_norm": 0.09230814129114151, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 6098 + }, + { + "epoch": 4.871405750798722, + "grad_norm": 0.06775277107954025, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6099 + }, + { + "epoch": 4.872204472843451, + "grad_norm": 0.28747716546058655, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6100 + }, + { + "epoch": 4.873003194888179, + "grad_norm": 0.11956486105918884, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6101 + }, + { + "epoch": 4.873801916932908, + "grad_norm": 0.09843557327985764, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6102 + }, + { + "epoch": 4.874600638977636, + "grad_norm": 0.08408313244581223, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6103 + }, + { + "epoch": 4.875399361022364, + "grad_norm": 0.08230917155742645, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6104 + }, + { + "epoch": 4.876198083067092, + "grad_norm": 0.08927451819181442, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6105 + }, + { + "epoch": 4.876996805111821, + "grad_norm": 0.5961875319480896, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6106 + }, + { + "epoch": 4.877795527156549, + "grad_norm": 0.5851842164993286, + "learning_rate": 0.0005, + "loss": 1.0817, + "step": 6107 + }, + { + "epoch": 4.878594249201278, + "grad_norm": 0.4428717792034149, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6108 + }, + { + "epoch": 4.8793929712460065, + "grad_norm": 3.760467052459717, + "learning_rate": 0.0005, + "loss": 1.0756, + "step": 6109 + }, + { + "epoch": 4.880191693290735, + "grad_norm": 84.49950408935547, + "learning_rate": 0.0005, + "loss": 1.0762, + "step": 6110 + }, + { + "epoch": 4.8809904153354635, + "grad_norm": 66320516.0, + "learning_rate": 0.0005, + "loss": 1.1423, + "step": 6111 + }, + { + "epoch": 4.881789137380192, + "grad_norm": 676613568.0, + "learning_rate": 0.0005, + "loss": 1.1818, + "step": 6112 + }, + { + "epoch": 4.88258785942492, + "grad_norm": 2556641280.0, + "learning_rate": 0.0005, + "loss": 1.2458, + "step": 6113 + }, + { + "epoch": 4.883386581469648, + "grad_norm": 21960.341796875, + "learning_rate": 0.0005, + "loss": 1.3163, + "step": 6114 + }, + { + "epoch": 4.884185303514377, + "grad_norm": 3668.3603515625, + "learning_rate": 0.0005, + "loss": 1.4954, + "step": 6115 + }, + { + "epoch": 4.884984025559105, + "grad_norm": 9.501830101013184, + "learning_rate": 0.0005, + "loss": 2.0388, + "step": 6116 + }, + { + "epoch": 4.885782747603834, + "grad_norm": 1.9570647478103638, + "learning_rate": 0.0005, + "loss": 1.3693, + "step": 6117 + }, + { + "epoch": 4.886581469648562, + "grad_norm": 0.9678036570549011, + "learning_rate": 0.0005, + "loss": 1.2694, + "step": 6118 + }, + { + "epoch": 4.887380191693291, + "grad_norm": 0.7094120383262634, + "learning_rate": 0.0005, + "loss": 1.2114, + "step": 6119 + }, + { + "epoch": 4.888178913738019, + "grad_norm": 0.4029041826725006, + "learning_rate": 0.0005, + "loss": 1.1809, + "step": 6120 + }, + { + "epoch": 4.888977635782748, + "grad_norm": 0.8682520389556885, + "learning_rate": 0.0005, + "loss": 1.1689, + "step": 6121 + }, + { + "epoch": 4.8897763578274756, + "grad_norm": 0.5829207301139832, + "learning_rate": 0.0005, + "loss": 1.1921, + "step": 6122 + }, + { + "epoch": 4.890575079872205, + "grad_norm": 0.5038579702377319, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6123 + }, + { + "epoch": 4.891373801916933, + "grad_norm": 0.532597005367279, + "learning_rate": 0.0005, + "loss": 1.1904, + "step": 6124 + }, + { + "epoch": 4.892172523961661, + "grad_norm": 0.20122192800045013, + "learning_rate": 0.0005, + "loss": 1.1399, + "step": 6125 + }, + { + "epoch": 4.89297124600639, + "grad_norm": 0.22419369220733643, + "learning_rate": 0.0005, + "loss": 1.1296, + "step": 6126 + }, + { + "epoch": 4.893769968051118, + "grad_norm": 0.2319759726524353, + "learning_rate": 0.0005, + "loss": 1.13, + "step": 6127 + }, + { + "epoch": 4.894568690095847, + "grad_norm": 0.18733178079128265, + "learning_rate": 0.0005, + "loss": 1.1203, + "step": 6128 + }, + { + "epoch": 4.895367412140575, + "grad_norm": 0.35497167706489563, + "learning_rate": 0.0005, + "loss": 1.1135, + "step": 6129 + }, + { + "epoch": 4.896166134185304, + "grad_norm": 0.2551584243774414, + "learning_rate": 0.0005, + "loss": 1.1236, + "step": 6130 + }, + { + "epoch": 4.896964856230032, + "grad_norm": 0.337982714176178, + "learning_rate": 0.0005, + "loss": 1.1114, + "step": 6131 + }, + { + "epoch": 4.897763578274761, + "grad_norm": 0.2945634722709656, + "learning_rate": 0.0005, + "loss": 1.1048, + "step": 6132 + }, + { + "epoch": 4.8985623003194885, + "grad_norm": 0.2571047842502594, + "learning_rate": 0.0005, + "loss": 1.101, + "step": 6133 + }, + { + "epoch": 4.899361022364217, + "grad_norm": 0.23297041654586792, + "learning_rate": 0.0005, + "loss": 1.0974, + "step": 6134 + }, + { + "epoch": 4.9001597444089455, + "grad_norm": 0.24131764471530914, + "learning_rate": 0.0005, + "loss": 1.0976, + "step": 6135 + }, + { + "epoch": 4.900958466453674, + "grad_norm": 0.22283275425434113, + "learning_rate": 0.0005, + "loss": 1.0951, + "step": 6136 + }, + { + "epoch": 4.901757188498403, + "grad_norm": 0.1691826730966568, + "learning_rate": 0.0005, + "loss": 1.0882, + "step": 6137 + }, + { + "epoch": 4.902555910543131, + "grad_norm": 0.1532466858625412, + "learning_rate": 0.0005, + "loss": 1.0887, + "step": 6138 + }, + { + "epoch": 4.90335463258786, + "grad_norm": 0.14135177433490753, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 6139 + }, + { + "epoch": 4.904153354632588, + "grad_norm": 0.14410537481307983, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 6140 + }, + { + "epoch": 4.904952076677317, + "grad_norm": 0.1097448468208313, + "learning_rate": 0.0005, + "loss": 1.0779, + "step": 6141 + }, + { + "epoch": 4.905750798722044, + "grad_norm": 0.0851673111319542, + "learning_rate": 0.0005, + "loss": 1.0842, + "step": 6142 + }, + { + "epoch": 4.906549520766773, + "grad_norm": 0.13842107355594635, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 6143 + }, + { + "epoch": 4.907348242811501, + "grad_norm": 0.15126317739486694, + "learning_rate": 0.0005, + "loss": 1.0838, + "step": 6144 + }, + { + "epoch": 4.90814696485623, + "grad_norm": 0.13176177442073822, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6145 + }, + { + "epoch": 4.9089456869009584, + "grad_norm": 0.164788156747818, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 6146 + }, + { + "epoch": 4.909744408945687, + "grad_norm": 0.24943718314170837, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6147 + }, + { + "epoch": 4.9105431309904155, + "grad_norm": 0.4325760304927826, + "learning_rate": 0.0005, + "loss": 1.0741, + "step": 6148 + }, + { + "epoch": 4.911341853035144, + "grad_norm": 0.5711309313774109, + "learning_rate": 0.0005, + "loss": 1.0832, + "step": 6149 + }, + { + "epoch": 4.912140575079873, + "grad_norm": 0.37636998295783997, + "learning_rate": 0.0005, + "loss": 1.0816, + "step": 6150 + }, + { + "epoch": 4.9129392971246, + "grad_norm": 0.2788292169570923, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 6151 + }, + { + "epoch": 4.913738019169329, + "grad_norm": 0.31709909439086914, + "learning_rate": 0.0005, + "loss": 1.0813, + "step": 6152 + }, + { + "epoch": 4.914536741214057, + "grad_norm": 0.14585916697978973, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6153 + }, + { + "epoch": 4.915335463258786, + "grad_norm": 0.1302923858165741, + "learning_rate": 0.0005, + "loss": 1.0722, + "step": 6154 + }, + { + "epoch": 4.916134185303514, + "grad_norm": 0.16156400740146637, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6155 + }, + { + "epoch": 4.916932907348243, + "grad_norm": 0.2323192059993744, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6156 + }, + { + "epoch": 4.917731629392971, + "grad_norm": 0.17504405975341797, + "learning_rate": 0.0005, + "loss": 1.072, + "step": 6157 + }, + { + "epoch": 4.9185303514377, + "grad_norm": 0.07211807370185852, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6158 + }, + { + "epoch": 4.919329073482428, + "grad_norm": 0.26426371932029724, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6159 + }, + { + "epoch": 4.920127795527156, + "grad_norm": 0.237858384847641, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 6160 + }, + { + "epoch": 4.9209265175718855, + "grad_norm": 0.23863473534584045, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6161 + }, + { + "epoch": 4.921725239616613, + "grad_norm": 0.3053814768791199, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6162 + }, + { + "epoch": 4.922523961661342, + "grad_norm": 0.2143447995185852, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6163 + }, + { + "epoch": 4.92332268370607, + "grad_norm": 0.12295633554458618, + "learning_rate": 0.0005, + "loss": 1.0827, + "step": 6164 + }, + { + "epoch": 4.924121405750799, + "grad_norm": 0.11128787696361542, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6165 + }, + { + "epoch": 4.924920127795527, + "grad_norm": 0.158652663230896, + "learning_rate": 0.0005, + "loss": 1.075, + "step": 6166 + }, + { + "epoch": 4.925718849840256, + "grad_norm": 0.17612649500370026, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6167 + }, + { + "epoch": 4.926517571884984, + "grad_norm": 0.12243206799030304, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6168 + }, + { + "epoch": 4.927316293929713, + "grad_norm": 0.12234453856945038, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 6169 + }, + { + "epoch": 4.928115015974441, + "grad_norm": 0.1968356966972351, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6170 + }, + { + "epoch": 4.928913738019169, + "grad_norm": 0.17286576330661774, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6171 + }, + { + "epoch": 4.9297124600638975, + "grad_norm": 0.0847749337553978, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6172 + }, + { + "epoch": 4.930511182108626, + "grad_norm": 0.0704331174492836, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6173 + }, + { + "epoch": 4.931309904153355, + "grad_norm": 0.12671123445034027, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6174 + }, + { + "epoch": 4.932108626198083, + "grad_norm": 0.10653524100780487, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6175 + }, + { + "epoch": 4.932907348242812, + "grad_norm": 0.0606958381831646, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6176 + }, + { + "epoch": 4.93370607028754, + "grad_norm": 0.12248247116804123, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6177 + }, + { + "epoch": 4.934504792332269, + "grad_norm": 0.1370074301958084, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6178 + }, + { + "epoch": 4.935303514376997, + "grad_norm": 0.05940835922956467, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6179 + }, + { + "epoch": 4.936102236421725, + "grad_norm": 0.1440308690071106, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6180 + }, + { + "epoch": 4.936900958466453, + "grad_norm": 0.1972372829914093, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6181 + }, + { + "epoch": 4.937699680511182, + "grad_norm": 0.10575850307941437, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6182 + }, + { + "epoch": 4.93849840255591, + "grad_norm": 0.11902400851249695, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6183 + }, + { + "epoch": 4.939297124600639, + "grad_norm": 0.15276090800762177, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6184 + }, + { + "epoch": 4.9400958466453675, + "grad_norm": 0.07495112717151642, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 6185 + }, + { + "epoch": 4.940894568690096, + "grad_norm": 0.10652542859315872, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6186 + }, + { + "epoch": 4.9416932907348246, + "grad_norm": 0.11347164958715439, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6187 + }, + { + "epoch": 4.942492012779553, + "grad_norm": 0.19946135580539703, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6188 + }, + { + "epoch": 4.943290734824281, + "grad_norm": 0.0771450325846672, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6189 + }, + { + "epoch": 4.944089456869009, + "grad_norm": 0.1086430475115776, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6190 + }, + { + "epoch": 4.944888178913738, + "grad_norm": 0.08790839463472366, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6191 + }, + { + "epoch": 4.945686900958466, + "grad_norm": 0.22063800692558289, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6192 + }, + { + "epoch": 4.946485623003195, + "grad_norm": 0.22287815809249878, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6193 + }, + { + "epoch": 4.947284345047923, + "grad_norm": 1.695265769958496, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6194 + }, + { + "epoch": 4.948083067092652, + "grad_norm": 0.6316840052604675, + "learning_rate": 0.0005, + "loss": 1.0933, + "step": 6195 + }, + { + "epoch": 4.94888178913738, + "grad_norm": 0.35637202858924866, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 6196 + }, + { + "epoch": 4.949680511182109, + "grad_norm": 0.2844616174697876, + "learning_rate": 0.0005, + "loss": 1.0744, + "step": 6197 + }, + { + "epoch": 4.950479233226837, + "grad_norm": 0.19614022970199585, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 6198 + }, + { + "epoch": 4.951277955271565, + "grad_norm": 0.3665562868118286, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 6199 + }, + { + "epoch": 4.952076677316294, + "grad_norm": 0.1485169231891632, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 6200 + }, + { + "epoch": 4.952875399361022, + "grad_norm": 0.19647273421287537, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6201 + }, + { + "epoch": 4.953674121405751, + "grad_norm": 0.19809085130691528, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6202 + }, + { + "epoch": 4.954472843450479, + "grad_norm": 0.1129874736070633, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6203 + }, + { + "epoch": 4.955271565495208, + "grad_norm": 0.2082832157611847, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6204 + }, + { + "epoch": 4.956070287539936, + "grad_norm": 0.20414425432682037, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 6205 + }, + { + "epoch": 4.956869009584665, + "grad_norm": 0.16667422652244568, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6206 + }, + { + "epoch": 4.957667731629393, + "grad_norm": 0.25111839175224304, + "learning_rate": 0.0005, + "loss": 1.0711, + "step": 6207 + }, + { + "epoch": 4.958466453674122, + "grad_norm": 0.16995272040367126, + "learning_rate": 0.0005, + "loss": 1.068, + "step": 6208 + }, + { + "epoch": 4.9592651757188495, + "grad_norm": 0.10725044459104538, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6209 + }, + { + "epoch": 4.960063897763578, + "grad_norm": 0.17728300392627716, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6210 + }, + { + "epoch": 4.960862619808307, + "grad_norm": 0.1334110051393509, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6211 + }, + { + "epoch": 4.961661341853035, + "grad_norm": 0.14835794270038605, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6212 + }, + { + "epoch": 4.962460063897764, + "grad_norm": 0.14602027833461761, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6213 + }, + { + "epoch": 4.963258785942492, + "grad_norm": 0.162953719496727, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6214 + }, + { + "epoch": 4.964057507987221, + "grad_norm": 0.7214393615722656, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6215 + }, + { + "epoch": 4.964856230031949, + "grad_norm": 0.27030259370803833, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6216 + }, + { + "epoch": 4.965654952076678, + "grad_norm": 0.18558967113494873, + "learning_rate": 0.0005, + "loss": 1.0767, + "step": 6217 + }, + { + "epoch": 4.966453674121405, + "grad_norm": 0.09276804327964783, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6218 + }, + { + "epoch": 4.967252396166134, + "grad_norm": 0.11957832425832748, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6219 + }, + { + "epoch": 4.968051118210862, + "grad_norm": 0.8338447213172913, + "learning_rate": 0.0005, + "loss": 1.07, + "step": 6220 + }, + { + "epoch": 4.968849840255591, + "grad_norm": 0.7283904552459717, + "learning_rate": 0.0005, + "loss": 1.0732, + "step": 6221 + }, + { + "epoch": 4.9696485623003195, + "grad_norm": 0.07938430458307266, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6222 + }, + { + "epoch": 4.970447284345048, + "grad_norm": 0.15368770062923431, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6223 + }, + { + "epoch": 4.9712460063897765, + "grad_norm": 0.08823438733816147, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6224 + }, + { + "epoch": 4.972044728434505, + "grad_norm": 0.07656054943799973, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6225 + }, + { + "epoch": 4.972843450479234, + "grad_norm": 0.08777901530265808, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6226 + }, + { + "epoch": 4.973642172523961, + "grad_norm": 0.09863653033971786, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6227 + }, + { + "epoch": 4.97444089456869, + "grad_norm": 0.13259904086589813, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6228 + }, + { + "epoch": 4.975239616613418, + "grad_norm": 0.08148759603500366, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6229 + }, + { + "epoch": 4.976038338658147, + "grad_norm": 0.06982999294996262, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6230 + }, + { + "epoch": 4.976837060702875, + "grad_norm": 0.09279565513134003, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6231 + }, + { + "epoch": 4.977635782747604, + "grad_norm": 0.05821947008371353, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6232 + }, + { + "epoch": 4.978434504792332, + "grad_norm": 0.07475738972425461, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6233 + }, + { + "epoch": 4.979233226837061, + "grad_norm": 0.10464147478342056, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6234 + }, + { + "epoch": 4.9800319488817895, + "grad_norm": 0.08045687526464462, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6235 + }, + { + "epoch": 4.980830670926517, + "grad_norm": 0.08045300841331482, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 6236 + }, + { + "epoch": 4.981629392971246, + "grad_norm": 0.10313838720321655, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6237 + }, + { + "epoch": 4.982428115015974, + "grad_norm": 0.08065208047628403, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6238 + }, + { + "epoch": 4.983226837060703, + "grad_norm": 0.0807032585144043, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6239 + }, + { + "epoch": 4.984025559105431, + "grad_norm": 0.06274307519197464, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6240 + }, + { + "epoch": 4.98482428115016, + "grad_norm": 0.07299554347991943, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6241 + }, + { + "epoch": 4.985623003194888, + "grad_norm": 0.0592481754720211, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6242 + }, + { + "epoch": 4.986421725239617, + "grad_norm": 0.0766056478023529, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6243 + }, + { + "epoch": 4.987220447284345, + "grad_norm": 0.07707066088914871, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6244 + }, + { + "epoch": 4.988019169329074, + "grad_norm": 0.7231665849685669, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6245 + }, + { + "epoch": 4.988817891373802, + "grad_norm": 0.0678652748465538, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6246 + }, + { + "epoch": 4.98961661341853, + "grad_norm": 3.667872905731201, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6247 + }, + { + "epoch": 4.9904153354632586, + "grad_norm": 0.2416938990354538, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6248 + }, + { + "epoch": 4.991214057507987, + "grad_norm": 0.27054834365844727, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6249 + }, + { + "epoch": 4.992012779552716, + "grad_norm": 0.1435888707637787, + "learning_rate": 0.0005, + "loss": 1.0757, + "step": 6250 + }, + { + "epoch": 4.992811501597444, + "grad_norm": 0.1542683094739914, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6251 + }, + { + "epoch": 4.993610223642173, + "grad_norm": 0.1867702603340149, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6252 + }, + { + "epoch": 4.994408945686901, + "grad_norm": 0.09558507800102234, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6253 + }, + { + "epoch": 4.99520766773163, + "grad_norm": 0.3019699156284332, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6254 + }, + { + "epoch": 4.996006389776358, + "grad_norm": 0.11987117677927017, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6255 + }, + { + "epoch": 4.996805111821086, + "grad_norm": 0.11792664974927902, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6256 + }, + { + "epoch": 4.997603833865814, + "grad_norm": 0.15580247342586517, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6257 + }, + { + "epoch": 4.998402555910543, + "grad_norm": 0.20167642831802368, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6258 + }, + { + "epoch": 4.9992012779552715, + "grad_norm": 0.11203871667385101, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6259 + }, + { + "epoch": 5.0, + "grad_norm": 0.11081275343894958, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6260 + }, + { + "epoch": 5.0007987220447285, + "grad_norm": 0.11213719099760056, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6261 + }, + { + "epoch": 5.001597444089457, + "grad_norm": 0.11074960231781006, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6262 + }, + { + "epoch": 5.002396166134186, + "grad_norm": 0.07538039237260818, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6263 + }, + { + "epoch": 5.003194888178914, + "grad_norm": 0.0824185386300087, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6264 + }, + { + "epoch": 5.003993610223642, + "grad_norm": 0.08940225094556808, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6265 + }, + { + "epoch": 5.00479233226837, + "grad_norm": 0.07072590291500092, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6266 + }, + { + "epoch": 5.005591054313099, + "grad_norm": 0.13027220964431763, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6267 + }, + { + "epoch": 5.006389776357827, + "grad_norm": 0.09226793050765991, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 6268 + }, + { + "epoch": 5.007188498402556, + "grad_norm": 0.1879013329744339, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6269 + }, + { + "epoch": 5.007987220447284, + "grad_norm": 0.09063144028186798, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6270 + }, + { + "epoch": 5.008785942492013, + "grad_norm": 0.09013621509075165, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6271 + }, + { + "epoch": 5.0095846645367414, + "grad_norm": 0.2404542863368988, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6272 + }, + { + "epoch": 5.01038338658147, + "grad_norm": 0.11968059092760086, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6273 + }, + { + "epoch": 5.0111821086261985, + "grad_norm": 0.16429072618484497, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 6274 + }, + { + "epoch": 5.011980830670926, + "grad_norm": 0.08745420724153519, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6275 + }, + { + "epoch": 5.012779552715655, + "grad_norm": 0.09130390733480453, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6276 + }, + { + "epoch": 5.013578274760383, + "grad_norm": 0.06996344774961472, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6277 + }, + { + "epoch": 5.014376996805112, + "grad_norm": 0.06063826382160187, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6278 + }, + { + "epoch": 5.01517571884984, + "grad_norm": 0.14752542972564697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6279 + }, + { + "epoch": 5.015974440894569, + "grad_norm": 0.05987429618835449, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6280 + }, + { + "epoch": 5.016773162939297, + "grad_norm": 0.1716211587190628, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6281 + }, + { + "epoch": 5.017571884984026, + "grad_norm": 0.13823190331459045, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 6282 + }, + { + "epoch": 5.018370607028754, + "grad_norm": 0.09764201194047928, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 6283 + }, + { + "epoch": 5.019169329073482, + "grad_norm": 0.07897874712944031, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6284 + }, + { + "epoch": 5.0199680511182105, + "grad_norm": 0.07823392748832703, + "learning_rate": 0.0005, + "loss": 1.0676, + "step": 6285 + }, + { + "epoch": 5.020766773162939, + "grad_norm": 0.1033136323094368, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6286 + }, + { + "epoch": 5.021565495207668, + "grad_norm": 0.07100827991962433, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6287 + }, + { + "epoch": 5.022364217252396, + "grad_norm": 0.40211987495422363, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 6288 + }, + { + "epoch": 5.023162939297125, + "grad_norm": 0.15459896624088287, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6289 + }, + { + "epoch": 5.023961661341853, + "grad_norm": 0.07789050787687302, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6290 + }, + { + "epoch": 5.024760383386582, + "grad_norm": 0.2116134762763977, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6291 + }, + { + "epoch": 5.02555910543131, + "grad_norm": 0.1842123568058014, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6292 + }, + { + "epoch": 5.026357827476039, + "grad_norm": 0.2037680447101593, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6293 + }, + { + "epoch": 5.027156549520766, + "grad_norm": 0.10851238667964935, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6294 + }, + { + "epoch": 5.027955271565495, + "grad_norm": 0.14465196430683136, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6295 + }, + { + "epoch": 5.0287539936102235, + "grad_norm": 0.11993128806352615, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6296 + }, + { + "epoch": 5.029552715654952, + "grad_norm": 0.13647349178791046, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6297 + }, + { + "epoch": 5.0303514376996805, + "grad_norm": 0.11265698075294495, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 6298 + }, + { + "epoch": 5.031150159744409, + "grad_norm": 18.601808547973633, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6299 + }, + { + "epoch": 5.031948881789138, + "grad_norm": 0.40079689025878906, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6300 + }, + { + "epoch": 5.032747603833866, + "grad_norm": 3.513967752456665, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6301 + }, + { + "epoch": 5.033546325878595, + "grad_norm": 24.040191650390625, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6302 + }, + { + "epoch": 5.034345047923322, + "grad_norm": 0.7786405086517334, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 6303 + }, + { + "epoch": 5.035143769968051, + "grad_norm": 0.619868814945221, + "learning_rate": 0.0005, + "loss": 1.0719, + "step": 6304 + }, + { + "epoch": 5.035942492012779, + "grad_norm": 6.039219379425049, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6305 + }, + { + "epoch": 5.036741214057508, + "grad_norm": 23.90920639038086, + "learning_rate": 0.0005, + "loss": 1.0961, + "step": 6306 + }, + { + "epoch": 5.037539936102236, + "grad_norm": 1.296809196472168, + "learning_rate": 0.0005, + "loss": 1.1649, + "step": 6307 + }, + { + "epoch": 5.038338658146965, + "grad_norm": 0.7673514485359192, + "learning_rate": 0.0005, + "loss": 1.151, + "step": 6308 + }, + { + "epoch": 5.039137380191693, + "grad_norm": 0.5065979957580566, + "learning_rate": 0.0005, + "loss": 1.1334, + "step": 6309 + }, + { + "epoch": 5.039936102236422, + "grad_norm": 0.3858639597892761, + "learning_rate": 0.0005, + "loss": 1.1453, + "step": 6310 + }, + { + "epoch": 5.0407348242811505, + "grad_norm": 0.2647075653076172, + "learning_rate": 0.0005, + "loss": 1.1238, + "step": 6311 + }, + { + "epoch": 5.041533546325879, + "grad_norm": 0.2713094651699066, + "learning_rate": 0.0005, + "loss": 1.1112, + "step": 6312 + }, + { + "epoch": 5.042332268370607, + "grad_norm": 0.2573802173137665, + "learning_rate": 0.0005, + "loss": 1.1068, + "step": 6313 + }, + { + "epoch": 5.043130990415335, + "grad_norm": 0.2083175778388977, + "learning_rate": 0.0005, + "loss": 1.0897, + "step": 6314 + }, + { + "epoch": 5.043929712460064, + "grad_norm": 0.3625626564025879, + "learning_rate": 0.0005, + "loss": 1.0979, + "step": 6315 + }, + { + "epoch": 5.044728434504792, + "grad_norm": 0.331129789352417, + "learning_rate": 0.0005, + "loss": 1.0931, + "step": 6316 + }, + { + "epoch": 5.045527156549521, + "grad_norm": 0.23352555930614471, + "learning_rate": 0.0005, + "loss": 1.0872, + "step": 6317 + }, + { + "epoch": 5.046325878594249, + "grad_norm": 0.24043256044387817, + "learning_rate": 0.0005, + "loss": 1.0941, + "step": 6318 + }, + { + "epoch": 5.047124600638978, + "grad_norm": 0.31510207056999207, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 6319 + }, + { + "epoch": 5.047923322683706, + "grad_norm": 0.6896952390670776, + "learning_rate": 0.0005, + "loss": 1.0992, + "step": 6320 + }, + { + "epoch": 5.048722044728435, + "grad_norm": 0.7915457487106323, + "learning_rate": 0.0005, + "loss": 1.1037, + "step": 6321 + }, + { + "epoch": 5.0495207667731625, + "grad_norm": 0.2959117889404297, + "learning_rate": 0.0005, + "loss": 1.0828, + "step": 6322 + }, + { + "epoch": 5.050319488817891, + "grad_norm": 0.44844529032707214, + "learning_rate": 0.0005, + "loss": 1.0879, + "step": 6323 + }, + { + "epoch": 5.05111821086262, + "grad_norm": 0.3385697305202484, + "learning_rate": 0.0005, + "loss": 1.077, + "step": 6324 + }, + { + "epoch": 5.051916932907348, + "grad_norm": 0.31220802664756775, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 6325 + }, + { + "epoch": 5.052715654952077, + "grad_norm": 0.3420731723308563, + "learning_rate": 0.0005, + "loss": 1.0811, + "step": 6326 + }, + { + "epoch": 5.053514376996805, + "grad_norm": 0.3061322569847107, + "learning_rate": 0.0005, + "loss": 1.0728, + "step": 6327 + }, + { + "epoch": 5.054313099041534, + "grad_norm": 0.6878030300140381, + "learning_rate": 0.0005, + "loss": 1.0773, + "step": 6328 + }, + { + "epoch": 5.055111821086262, + "grad_norm": 0.1927136927843094, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 6329 + }, + { + "epoch": 5.055910543130991, + "grad_norm": 0.24812163412570953, + "learning_rate": 0.0005, + "loss": 1.0738, + "step": 6330 + }, + { + "epoch": 5.056709265175719, + "grad_norm": 0.19675321877002716, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6331 + }, + { + "epoch": 5.057507987220447, + "grad_norm": 0.20720984041690826, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 6332 + }, + { + "epoch": 5.0583067092651754, + "grad_norm": 0.1260477900505066, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6333 + }, + { + "epoch": 5.059105431309904, + "grad_norm": 0.24399158358573914, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6334 + }, + { + "epoch": 5.0599041533546325, + "grad_norm": 0.22406993806362152, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6335 + }, + { + "epoch": 5.060702875399361, + "grad_norm": 0.24807684123516083, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 6336 + }, + { + "epoch": 5.06150159744409, + "grad_norm": 0.1272616684436798, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6337 + }, + { + "epoch": 5.062300319488818, + "grad_norm": 0.2053418755531311, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 6338 + }, + { + "epoch": 5.063099041533547, + "grad_norm": 0.13628287613391876, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6339 + }, + { + "epoch": 5.063897763578275, + "grad_norm": 0.21262522041797638, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 6340 + }, + { + "epoch": 5.064696485623003, + "grad_norm": 0.3784351646900177, + "learning_rate": 0.0005, + "loss": 1.0685, + "step": 6341 + }, + { + "epoch": 5.065495207667731, + "grad_norm": 0.3282131552696228, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6342 + }, + { + "epoch": 5.06629392971246, + "grad_norm": 0.10128312557935715, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6343 + }, + { + "epoch": 5.067092651757188, + "grad_norm": 0.2297000139951706, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6344 + }, + { + "epoch": 5.067891373801917, + "grad_norm": 0.11327458173036575, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6345 + }, + { + "epoch": 5.068690095846645, + "grad_norm": 0.16150346398353577, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6346 + }, + { + "epoch": 5.069488817891374, + "grad_norm": 0.15486986935138702, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6347 + }, + { + "epoch": 5.0702875399361025, + "grad_norm": 0.12427826225757599, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6348 + }, + { + "epoch": 5.071086261980831, + "grad_norm": 0.11321424692869186, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6349 + }, + { + "epoch": 5.0718849840255595, + "grad_norm": 0.12668851017951965, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 6350 + }, + { + "epoch": 5.072683706070287, + "grad_norm": 0.20059579610824585, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6351 + }, + { + "epoch": 5.073482428115016, + "grad_norm": 0.14591605961322784, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6352 + }, + { + "epoch": 5.074281150159744, + "grad_norm": 0.19168664515018463, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6353 + }, + { + "epoch": 5.075079872204473, + "grad_norm": 0.19381079077720642, + "learning_rate": 0.0005, + "loss": 1.0702, + "step": 6354 + }, + { + "epoch": 5.075878594249201, + "grad_norm": 0.0957496389746666, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6355 + }, + { + "epoch": 5.07667731629393, + "grad_norm": 0.11414145678281784, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6356 + }, + { + "epoch": 5.077476038338658, + "grad_norm": 0.10855124145746231, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6357 + }, + { + "epoch": 5.078274760383387, + "grad_norm": 0.2300068736076355, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 6358 + }, + { + "epoch": 5.079073482428115, + "grad_norm": 0.15098270773887634, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 6359 + }, + { + "epoch": 5.079872204472843, + "grad_norm": 0.09821227937936783, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6360 + }, + { + "epoch": 5.080670926517572, + "grad_norm": 0.135583758354187, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6361 + }, + { + "epoch": 5.0814696485623, + "grad_norm": 0.07262608408927917, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6362 + }, + { + "epoch": 5.082268370607029, + "grad_norm": 0.10731761902570724, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6363 + }, + { + "epoch": 5.083067092651757, + "grad_norm": 0.27508556842803955, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6364 + }, + { + "epoch": 5.083865814696486, + "grad_norm": 0.12996995449066162, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6365 + }, + { + "epoch": 5.084664536741214, + "grad_norm": 0.10386788845062256, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6366 + }, + { + "epoch": 5.085463258785943, + "grad_norm": 0.07591816782951355, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6367 + }, + { + "epoch": 5.086261980830671, + "grad_norm": 0.09341761469841003, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6368 + }, + { + "epoch": 5.0870607028754, + "grad_norm": 0.12575088441371918, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6369 + }, + { + "epoch": 5.087859424920127, + "grad_norm": 0.3423956036567688, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 6370 + }, + { + "epoch": 5.088658146964856, + "grad_norm": 0.2154775857925415, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6371 + }, + { + "epoch": 5.0894568690095845, + "grad_norm": 0.1550479382276535, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 6372 + }, + { + "epoch": 5.090255591054313, + "grad_norm": 0.08802525699138641, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6373 + }, + { + "epoch": 5.0910543130990416, + "grad_norm": 0.08421735465526581, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6374 + }, + { + "epoch": 5.09185303514377, + "grad_norm": 0.08920808881521225, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 6375 + }, + { + "epoch": 5.092651757188499, + "grad_norm": 0.1450507938861847, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6376 + }, + { + "epoch": 5.093450479233227, + "grad_norm": 0.16926947236061096, + "learning_rate": 0.0005, + "loss": 1.0729, + "step": 6377 + }, + { + "epoch": 5.094249201277956, + "grad_norm": 0.6995428204536438, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6378 + }, + { + "epoch": 5.095047923322683, + "grad_norm": 0.10353969782590866, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6379 + }, + { + "epoch": 5.095846645367412, + "grad_norm": 0.09132180362939835, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6380 + }, + { + "epoch": 5.09664536741214, + "grad_norm": 0.17745476961135864, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6381 + }, + { + "epoch": 5.097444089456869, + "grad_norm": 0.10596930980682373, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6382 + }, + { + "epoch": 5.098242811501597, + "grad_norm": 0.11676348745822906, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6383 + }, + { + "epoch": 5.099041533546326, + "grad_norm": 0.13022664189338684, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6384 + }, + { + "epoch": 5.0998402555910545, + "grad_norm": 0.11169753223657608, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6385 + }, + { + "epoch": 5.100638977635783, + "grad_norm": 0.07439867407083511, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6386 + }, + { + "epoch": 5.1014376996805115, + "grad_norm": 0.06953777372837067, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6387 + }, + { + "epoch": 5.102236421725239, + "grad_norm": 0.09419669955968857, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6388 + }, + { + "epoch": 5.103035143769968, + "grad_norm": 0.1166587546467781, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6389 + }, + { + "epoch": 5.103833865814696, + "grad_norm": 0.5776185393333435, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 6390 + }, + { + "epoch": 5.104632587859425, + "grad_norm": 0.13175810873508453, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6391 + }, + { + "epoch": 5.105431309904153, + "grad_norm": 0.09372890740633011, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 6392 + }, + { + "epoch": 5.106230031948882, + "grad_norm": 0.25262513756752014, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6393 + }, + { + "epoch": 5.10702875399361, + "grad_norm": 0.1348644196987152, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6394 + }, + { + "epoch": 5.107827476038339, + "grad_norm": 0.23879335820674896, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6395 + }, + { + "epoch": 5.108626198083067, + "grad_norm": 0.25561729073524475, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 6396 + }, + { + "epoch": 5.109424920127796, + "grad_norm": 0.26974916458129883, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6397 + }, + { + "epoch": 5.110223642172524, + "grad_norm": 0.1866329163312912, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6398 + }, + { + "epoch": 5.111022364217252, + "grad_norm": 0.22104737162590027, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6399 + }, + { + "epoch": 5.111821086261981, + "grad_norm": 0.3775753676891327, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6400 + }, + { + "epoch": 5.112619808306709, + "grad_norm": 0.20636002719402313, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 6401 + }, + { + "epoch": 5.113418530351438, + "grad_norm": 0.1941772699356079, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6402 + }, + { + "epoch": 5.114217252396166, + "grad_norm": 0.14595480263233185, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 6403 + }, + { + "epoch": 5.115015974440895, + "grad_norm": 0.16794493794441223, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6404 + }, + { + "epoch": 5.115814696485623, + "grad_norm": 0.16466112434864044, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6405 + }, + { + "epoch": 5.116613418530352, + "grad_norm": 0.27192312479019165, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6406 + }, + { + "epoch": 5.11741214057508, + "grad_norm": 0.296017050743103, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6407 + }, + { + "epoch": 5.118210862619808, + "grad_norm": 0.24947655200958252, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6408 + }, + { + "epoch": 5.1190095846645365, + "grad_norm": 0.07843278348445892, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6409 + }, + { + "epoch": 5.119808306709265, + "grad_norm": 0.2507891356945038, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6410 + }, + { + "epoch": 5.1206070287539935, + "grad_norm": 0.2962022125720978, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6411 + }, + { + "epoch": 5.121405750798722, + "grad_norm": 0.21588601171970367, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6412 + }, + { + "epoch": 5.122204472843451, + "grad_norm": 0.27223092317581177, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6413 + }, + { + "epoch": 5.123003194888179, + "grad_norm": 0.1475650519132614, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6414 + }, + { + "epoch": 5.123801916932908, + "grad_norm": 0.2624805271625519, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6415 + }, + { + "epoch": 5.124600638977636, + "grad_norm": 0.27691081166267395, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6416 + }, + { + "epoch": 5.125399361022364, + "grad_norm": 0.1828494369983673, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6417 + }, + { + "epoch": 5.126198083067092, + "grad_norm": 0.27542614936828613, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 6418 + }, + { + "epoch": 5.126996805111821, + "grad_norm": 0.16250371932983398, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6419 + }, + { + "epoch": 5.127795527156549, + "grad_norm": 0.17180733382701874, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6420 + }, + { + "epoch": 5.128594249201278, + "grad_norm": 0.21466004848480225, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 6421 + }, + { + "epoch": 5.1293929712460065, + "grad_norm": 0.13144539296627045, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6422 + }, + { + "epoch": 5.130191693290735, + "grad_norm": 0.158688023686409, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6423 + }, + { + "epoch": 5.1309904153354635, + "grad_norm": 0.1430175006389618, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6424 + }, + { + "epoch": 5.131789137380192, + "grad_norm": 0.0988554134964943, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6425 + }, + { + "epoch": 5.13258785942492, + "grad_norm": 0.18320757150650024, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6426 + }, + { + "epoch": 5.133386581469648, + "grad_norm": 0.34172165393829346, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6427 + }, + { + "epoch": 5.134185303514377, + "grad_norm": 0.095450758934021, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6428 + }, + { + "epoch": 5.134984025559105, + "grad_norm": 0.2988479733467102, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6429 + }, + { + "epoch": 5.135782747603834, + "grad_norm": 0.11462085694074631, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6430 + }, + { + "epoch": 5.136581469648562, + "grad_norm": 0.11989153176546097, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6431 + }, + { + "epoch": 5.137380191693291, + "grad_norm": 0.15308552980422974, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6432 + }, + { + "epoch": 5.138178913738019, + "grad_norm": 0.1119944304227829, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6433 + }, + { + "epoch": 5.138977635782748, + "grad_norm": 0.38812172412872314, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6434 + }, + { + "epoch": 5.139776357827476, + "grad_norm": 0.24718649685382843, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6435 + }, + { + "epoch": 5.140575079872204, + "grad_norm": 0.15834778547286987, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6436 + }, + { + "epoch": 5.141373801916933, + "grad_norm": 0.1960451751947403, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 6437 + }, + { + "epoch": 5.142172523961661, + "grad_norm": 0.16195416450500488, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6438 + }, + { + "epoch": 5.14297124600639, + "grad_norm": 0.07554367184638977, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6439 + }, + { + "epoch": 5.143769968051118, + "grad_norm": 0.18924687802791595, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6440 + }, + { + "epoch": 5.144568690095847, + "grad_norm": 0.16253480315208435, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6441 + }, + { + "epoch": 5.145367412140575, + "grad_norm": 0.12711918354034424, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6442 + }, + { + "epoch": 5.146166134185304, + "grad_norm": 0.16831086575984955, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6443 + }, + { + "epoch": 5.146964856230032, + "grad_norm": 0.35199087858200073, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6444 + }, + { + "epoch": 5.147763578274761, + "grad_norm": 0.1340232491493225, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6445 + }, + { + "epoch": 5.1485623003194885, + "grad_norm": 0.1397274285554886, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 6446 + }, + { + "epoch": 5.149361022364217, + "grad_norm": 0.13868366181850433, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6447 + }, + { + "epoch": 5.1501597444089455, + "grad_norm": 0.08846192806959152, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 6448 + }, + { + "epoch": 5.150958466453674, + "grad_norm": 0.08350610733032227, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6449 + }, + { + "epoch": 5.151757188498403, + "grad_norm": 0.14727875590324402, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6450 + }, + { + "epoch": 5.152555910543131, + "grad_norm": 0.11705708503723145, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6451 + }, + { + "epoch": 5.15335463258786, + "grad_norm": 0.10308192670345306, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6452 + }, + { + "epoch": 5.154153354632588, + "grad_norm": 0.09459209442138672, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6453 + }, + { + "epoch": 5.154952076677317, + "grad_norm": 0.11605191230773926, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6454 + }, + { + "epoch": 5.155750798722044, + "grad_norm": 0.24275821447372437, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6455 + }, + { + "epoch": 5.156549520766773, + "grad_norm": 0.208640456199646, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6456 + }, + { + "epoch": 5.157348242811501, + "grad_norm": 0.15257662534713745, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6457 + }, + { + "epoch": 5.15814696485623, + "grad_norm": 0.10431355237960815, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6458 + }, + { + "epoch": 5.1589456869009584, + "grad_norm": 0.14187589287757874, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6459 + }, + { + "epoch": 5.159744408945687, + "grad_norm": 0.19084404408931732, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6460 + }, + { + "epoch": 5.1605431309904155, + "grad_norm": 0.09255128353834152, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6461 + }, + { + "epoch": 5.161341853035144, + "grad_norm": 0.1443471759557724, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6462 + }, + { + "epoch": 5.162140575079873, + "grad_norm": 0.36597245931625366, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6463 + }, + { + "epoch": 5.1629392971246, + "grad_norm": 0.3835389316082001, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6464 + }, + { + "epoch": 5.163738019169329, + "grad_norm": 0.14208771288394928, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6465 + }, + { + "epoch": 5.164536741214057, + "grad_norm": 0.2520706355571747, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6466 + }, + { + "epoch": 5.165335463258786, + "grad_norm": 0.2595224976539612, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6467 + }, + { + "epoch": 5.166134185303514, + "grad_norm": 0.15721063315868378, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6468 + }, + { + "epoch": 5.166932907348243, + "grad_norm": 0.1772007793188095, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6469 + }, + { + "epoch": 5.167731629392971, + "grad_norm": 0.19899888336658478, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6470 + }, + { + "epoch": 5.1685303514377, + "grad_norm": 0.18689346313476562, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6471 + }, + { + "epoch": 5.169329073482428, + "grad_norm": 0.16748468577861786, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6472 + }, + { + "epoch": 5.170127795527157, + "grad_norm": 0.13296879827976227, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6473 + }, + { + "epoch": 5.170926517571885, + "grad_norm": 0.18742166459560394, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6474 + }, + { + "epoch": 5.171725239616613, + "grad_norm": 0.17811308801174164, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6475 + }, + { + "epoch": 5.172523961661342, + "grad_norm": 0.1360485702753067, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 6476 + }, + { + "epoch": 5.17332268370607, + "grad_norm": 0.13431121408939362, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6477 + }, + { + "epoch": 5.174121405750799, + "grad_norm": 0.12888069450855255, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 6478 + }, + { + "epoch": 5.174920127795527, + "grad_norm": 0.15194712579250336, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6479 + }, + { + "epoch": 5.175718849840256, + "grad_norm": 0.13076889514923096, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6480 + }, + { + "epoch": 5.176517571884984, + "grad_norm": 0.14751110970973969, + "learning_rate": 0.0005, + "loss": 1.0693, + "step": 6481 + }, + { + "epoch": 5.177316293929713, + "grad_norm": 0.11919333785772324, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6482 + }, + { + "epoch": 5.178115015974441, + "grad_norm": 0.12712688744068146, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 6483 + }, + { + "epoch": 5.178913738019169, + "grad_norm": 0.13765369355678558, + "learning_rate": 0.0005, + "loss": 1.0704, + "step": 6484 + }, + { + "epoch": 5.1797124600638975, + "grad_norm": 0.11060373485088348, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 6485 + }, + { + "epoch": 5.180511182108626, + "grad_norm": 0.056882213801145554, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6486 + }, + { + "epoch": 5.181309904153355, + "grad_norm": 0.11317770928144455, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6487 + }, + { + "epoch": 5.182108626198083, + "grad_norm": 0.09279809147119522, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6488 + }, + { + "epoch": 5.182907348242812, + "grad_norm": 0.09392786771059036, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6489 + }, + { + "epoch": 5.18370607028754, + "grad_norm": 0.13042815029621124, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 6490 + }, + { + "epoch": 5.184504792332269, + "grad_norm": 0.07929978519678116, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6491 + }, + { + "epoch": 5.185303514376997, + "grad_norm": 0.12215851992368698, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 6492 + }, + { + "epoch": 5.186102236421725, + "grad_norm": 0.12000773102045059, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6493 + }, + { + "epoch": 5.186900958466453, + "grad_norm": 0.08427707850933075, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6494 + }, + { + "epoch": 5.187699680511182, + "grad_norm": 0.158653125166893, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6495 + }, + { + "epoch": 5.18849840255591, + "grad_norm": 0.11087878793478012, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6496 + }, + { + "epoch": 5.189297124600639, + "grad_norm": 0.12649668753147125, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6497 + }, + { + "epoch": 5.1900958466453675, + "grad_norm": 0.0821281224489212, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6498 + }, + { + "epoch": 5.190894568690096, + "grad_norm": 0.07192671298980713, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6499 + }, + { + "epoch": 5.1916932907348246, + "grad_norm": 0.10505214333534241, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6500 + }, + { + "epoch": 5.192492012779553, + "grad_norm": 0.11772353947162628, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6501 + }, + { + "epoch": 5.193290734824281, + "grad_norm": 0.15557901561260223, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6502 + }, + { + "epoch": 5.194089456869009, + "grad_norm": 0.09753020852804184, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6503 + }, + { + "epoch": 5.194888178913738, + "grad_norm": 0.10331830382347107, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6504 + }, + { + "epoch": 5.195686900958466, + "grad_norm": 0.130085289478302, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6505 + }, + { + "epoch": 5.196485623003195, + "grad_norm": 0.08772018551826477, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6506 + }, + { + "epoch": 5.197284345047923, + "grad_norm": 0.1906667798757553, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6507 + }, + { + "epoch": 5.198083067092652, + "grad_norm": 0.06724394112825394, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6508 + }, + { + "epoch": 5.19888178913738, + "grad_norm": 0.1141325905919075, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6509 + }, + { + "epoch": 5.199680511182109, + "grad_norm": 0.08354665338993073, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6510 + }, + { + "epoch": 5.2004792332268375, + "grad_norm": 0.1072440817952156, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6511 + }, + { + "epoch": 5.201277955271565, + "grad_norm": 0.10670839250087738, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6512 + }, + { + "epoch": 5.202076677316294, + "grad_norm": 0.10079781711101532, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6513 + }, + { + "epoch": 5.202875399361022, + "grad_norm": 0.1281125396490097, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6514 + }, + { + "epoch": 5.203674121405751, + "grad_norm": 0.1627720147371292, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6515 + }, + { + "epoch": 5.204472843450479, + "grad_norm": 0.1507575958967209, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6516 + }, + { + "epoch": 5.205271565495208, + "grad_norm": 0.17764779925346375, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6517 + }, + { + "epoch": 5.206070287539936, + "grad_norm": 0.1825307011604309, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6518 + }, + { + "epoch": 5.206869009584665, + "grad_norm": 0.1151907742023468, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6519 + }, + { + "epoch": 5.207667731629393, + "grad_norm": 0.1425708830356598, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6520 + }, + { + "epoch": 5.208466453674121, + "grad_norm": 0.08555550873279572, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 6521 + }, + { + "epoch": 5.2092651757188495, + "grad_norm": 0.15400084853172302, + "learning_rate": 0.0005, + "loss": 1.0447, + "step": 6522 + }, + { + "epoch": 5.210063897763578, + "grad_norm": 0.11088921129703522, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6523 + }, + { + "epoch": 5.210862619808307, + "grad_norm": 0.0959518551826477, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6524 + }, + { + "epoch": 5.211661341853035, + "grad_norm": 0.1054866686463356, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6525 + }, + { + "epoch": 5.212460063897764, + "grad_norm": 0.17849107086658478, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6526 + }, + { + "epoch": 5.213258785942492, + "grad_norm": 0.0910423994064331, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6527 + }, + { + "epoch": 5.214057507987221, + "grad_norm": 0.10857872664928436, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6528 + }, + { + "epoch": 5.214856230031949, + "grad_norm": 0.09012399613857269, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 6529 + }, + { + "epoch": 5.215654952076678, + "grad_norm": 0.14724178612232208, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 6530 + }, + { + "epoch": 5.216453674121405, + "grad_norm": 0.11357409507036209, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6531 + }, + { + "epoch": 5.217252396166134, + "grad_norm": 0.09721364825963974, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6532 + }, + { + "epoch": 5.218051118210862, + "grad_norm": 0.07837430387735367, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6533 + }, + { + "epoch": 5.218849840255591, + "grad_norm": 0.1181735098361969, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6534 + }, + { + "epoch": 5.2196485623003195, + "grad_norm": 0.07066017389297485, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6535 + }, + { + "epoch": 5.220447284345048, + "grad_norm": 0.06838417053222656, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6536 + }, + { + "epoch": 5.2212460063897765, + "grad_norm": 0.0919245257973671, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6537 + }, + { + "epoch": 5.222044728434505, + "grad_norm": 0.06859984248876572, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6538 + }, + { + "epoch": 5.222843450479234, + "grad_norm": 1.929213523864746, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 6539 + }, + { + "epoch": 5.223642172523961, + "grad_norm": 0.11181562393903732, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6540 + }, + { + "epoch": 5.22444089456869, + "grad_norm": 0.09261998534202576, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6541 + }, + { + "epoch": 5.225239616613418, + "grad_norm": 0.11214403063058853, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6542 + }, + { + "epoch": 5.226038338658147, + "grad_norm": 0.1353820264339447, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6543 + }, + { + "epoch": 5.226837060702875, + "grad_norm": 0.11579953879117966, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 6544 + }, + { + "epoch": 5.227635782747604, + "grad_norm": 0.08284885436296463, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6545 + }, + { + "epoch": 5.228434504792332, + "grad_norm": 0.13805733621120453, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6546 + }, + { + "epoch": 5.229233226837061, + "grad_norm": 0.08924185484647751, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6547 + }, + { + "epoch": 5.2300319488817895, + "grad_norm": 0.10975285619497299, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6548 + }, + { + "epoch": 5.230830670926518, + "grad_norm": 0.10500271618366241, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6549 + }, + { + "epoch": 5.231629392971246, + "grad_norm": 0.09947814792394638, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6550 + }, + { + "epoch": 5.232428115015974, + "grad_norm": 0.10113594681024551, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6551 + }, + { + "epoch": 5.233226837060703, + "grad_norm": 0.12645265460014343, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6552 + }, + { + "epoch": 5.234025559105431, + "grad_norm": 0.06775741279125214, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6553 + }, + { + "epoch": 5.23482428115016, + "grad_norm": 0.09799529612064362, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6554 + }, + { + "epoch": 5.235623003194888, + "grad_norm": 0.13129538297653198, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6555 + }, + { + "epoch": 5.236421725239617, + "grad_norm": 0.10139735788106918, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6556 + }, + { + "epoch": 5.237220447284345, + "grad_norm": 0.13819058239459991, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6557 + }, + { + "epoch": 5.238019169329074, + "grad_norm": 0.09306512027978897, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6558 + }, + { + "epoch": 5.2388178913738015, + "grad_norm": 0.07963602244853973, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6559 + }, + { + "epoch": 5.23961661341853, + "grad_norm": 0.12864448130130768, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6560 + }, + { + "epoch": 5.2404153354632586, + "grad_norm": 0.1044403612613678, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6561 + }, + { + "epoch": 5.241214057507987, + "grad_norm": 0.07623843848705292, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6562 + }, + { + "epoch": 5.242012779552716, + "grad_norm": 0.10385097563266754, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 6563 + }, + { + "epoch": 5.242811501597444, + "grad_norm": 0.07048188149929047, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6564 + }, + { + "epoch": 5.243610223642173, + "grad_norm": 0.25789955258369446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6565 + }, + { + "epoch": 5.244408945686901, + "grad_norm": 0.12271685153245926, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6566 + }, + { + "epoch": 5.24520766773163, + "grad_norm": 0.10512058436870575, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 6567 + }, + { + "epoch": 5.246006389776358, + "grad_norm": 0.07663438469171524, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6568 + }, + { + "epoch": 5.246805111821086, + "grad_norm": 0.09937599301338196, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6569 + }, + { + "epoch": 5.247603833865814, + "grad_norm": 0.12242338061332703, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6570 + }, + { + "epoch": 5.248402555910543, + "grad_norm": 0.1733475625514984, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6571 + }, + { + "epoch": 5.2492012779552715, + "grad_norm": 0.1460944414138794, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6572 + }, + { + "epoch": 5.25, + "grad_norm": 0.09406521171331406, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6573 + }, + { + "epoch": 5.2507987220447285, + "grad_norm": 1.0146688222885132, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6574 + }, + { + "epoch": 5.251597444089457, + "grad_norm": 0.10557705909013748, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6575 + }, + { + "epoch": 5.252396166134186, + "grad_norm": 0.1306990385055542, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6576 + }, + { + "epoch": 5.253194888178914, + "grad_norm": 0.094961017370224, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6577 + }, + { + "epoch": 5.253993610223642, + "grad_norm": 0.13421863317489624, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 6578 + }, + { + "epoch": 5.25479233226837, + "grad_norm": 0.12371776252985, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6579 + }, + { + "epoch": 5.255591054313099, + "grad_norm": 0.15863509476184845, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6580 + }, + { + "epoch": 5.256389776357827, + "grad_norm": 0.1156599149107933, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6581 + }, + { + "epoch": 5.257188498402556, + "grad_norm": 0.07102219015359879, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6582 + }, + { + "epoch": 5.257987220447284, + "grad_norm": 0.09030039608478546, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6583 + }, + { + "epoch": 5.258785942492013, + "grad_norm": 0.08848102390766144, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6584 + }, + { + "epoch": 5.2595846645367414, + "grad_norm": 0.07455430924892426, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6585 + }, + { + "epoch": 5.26038338658147, + "grad_norm": 0.07729559391736984, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6586 + }, + { + "epoch": 5.261182108626198, + "grad_norm": 0.0955357626080513, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 6587 + }, + { + "epoch": 5.261980830670926, + "grad_norm": 0.08680911362171173, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6588 + }, + { + "epoch": 5.262779552715655, + "grad_norm": 0.1033414825797081, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6589 + }, + { + "epoch": 5.263578274760383, + "grad_norm": 0.09428979456424713, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6590 + }, + { + "epoch": 5.264376996805112, + "grad_norm": 0.07567942887544632, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6591 + }, + { + "epoch": 5.26517571884984, + "grad_norm": 0.221647247672081, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6592 + }, + { + "epoch": 5.265974440894569, + "grad_norm": 0.13839758932590485, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6593 + }, + { + "epoch": 5.266773162939297, + "grad_norm": 0.06060291454195976, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6594 + }, + { + "epoch": 5.267571884984026, + "grad_norm": 0.09146185964345932, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6595 + }, + { + "epoch": 5.268370607028754, + "grad_norm": 0.05557526275515556, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6596 + }, + { + "epoch": 5.269169329073483, + "grad_norm": 0.10190495103597641, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6597 + }, + { + "epoch": 5.2699680511182105, + "grad_norm": 0.07389659434556961, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6598 + }, + { + "epoch": 5.270766773162939, + "grad_norm": 0.11124115437269211, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6599 + }, + { + "epoch": 5.271565495207668, + "grad_norm": 0.10779515653848648, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 6600 + }, + { + "epoch": 5.272364217252396, + "grad_norm": 0.09347773343324661, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6601 + }, + { + "epoch": 5.273162939297125, + "grad_norm": 0.15056683123111725, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6602 + }, + { + "epoch": 5.273961661341853, + "grad_norm": 0.1398572027683258, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6603 + }, + { + "epoch": 5.274760383386582, + "grad_norm": 0.08360682427883148, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6604 + }, + { + "epoch": 5.27555910543131, + "grad_norm": 0.10360747575759888, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6605 + }, + { + "epoch": 5.276357827476039, + "grad_norm": 0.0864897072315216, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6606 + }, + { + "epoch": 5.277156549520766, + "grad_norm": 0.11505412310361862, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6607 + }, + { + "epoch": 5.277955271565495, + "grad_norm": 0.10638110339641571, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6608 + }, + { + "epoch": 5.2787539936102235, + "grad_norm": 0.08349479734897614, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6609 + }, + { + "epoch": 5.279552715654952, + "grad_norm": 0.14465951919555664, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 6610 + }, + { + "epoch": 5.2803514376996805, + "grad_norm": 0.08049577474594116, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6611 + }, + { + "epoch": 5.281150159744409, + "grad_norm": 0.10206092149019241, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6612 + }, + { + "epoch": 5.281948881789138, + "grad_norm": 0.2721571922302246, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6613 + }, + { + "epoch": 5.282747603833866, + "grad_norm": 0.17503346502780914, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6614 + }, + { + "epoch": 5.283546325878595, + "grad_norm": 0.11459292471408844, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6615 + }, + { + "epoch": 5.284345047923322, + "grad_norm": 0.9974967241287231, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6616 + }, + { + "epoch": 5.285143769968051, + "grad_norm": 0.11502816528081894, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6617 + }, + { + "epoch": 5.285942492012779, + "grad_norm": 0.12992256879806519, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6618 + }, + { + "epoch": 5.286741214057508, + "grad_norm": 0.19872024655342102, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6619 + }, + { + "epoch": 5.287539936102236, + "grad_norm": 0.13013097643852234, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6620 + }, + { + "epoch": 5.288338658146965, + "grad_norm": 0.13644525408744812, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6621 + }, + { + "epoch": 5.289137380191693, + "grad_norm": 0.15101996064186096, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6622 + }, + { + "epoch": 5.289936102236422, + "grad_norm": 0.11075131595134735, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6623 + }, + { + "epoch": 5.2907348242811505, + "grad_norm": 0.0904511958360672, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6624 + }, + { + "epoch": 5.291533546325878, + "grad_norm": 0.08861460536718369, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6625 + }, + { + "epoch": 5.292332268370607, + "grad_norm": 0.10443824529647827, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6626 + }, + { + "epoch": 5.293130990415335, + "grad_norm": 0.07440674304962158, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6627 + }, + { + "epoch": 5.293929712460064, + "grad_norm": 0.21709975600242615, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6628 + }, + { + "epoch": 5.294728434504792, + "grad_norm": 0.1281055063009262, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 6629 + }, + { + "epoch": 5.295527156549521, + "grad_norm": 0.10365202277898788, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6630 + }, + { + "epoch": 5.296325878594249, + "grad_norm": 1.004258632659912, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6631 + }, + { + "epoch": 5.297124600638978, + "grad_norm": 0.16660870611667633, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6632 + }, + { + "epoch": 5.297923322683706, + "grad_norm": 0.1146734207868576, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6633 + }, + { + "epoch": 5.298722044728435, + "grad_norm": 0.18288104236125946, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6634 + }, + { + "epoch": 5.2995207667731625, + "grad_norm": 0.11469347029924393, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6635 + }, + { + "epoch": 5.300319488817891, + "grad_norm": 0.1333407461643219, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6636 + }, + { + "epoch": 5.30111821086262, + "grad_norm": 0.15359243750572205, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6637 + }, + { + "epoch": 5.301916932907348, + "grad_norm": 0.0832027792930603, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6638 + }, + { + "epoch": 5.302715654952077, + "grad_norm": 0.10231718420982361, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6639 + }, + { + "epoch": 5.303514376996805, + "grad_norm": 0.11031626909971237, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6640 + }, + { + "epoch": 5.304313099041534, + "grad_norm": 0.08014792948961258, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6641 + }, + { + "epoch": 5.305111821086262, + "grad_norm": 0.10066475719213486, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6642 + }, + { + "epoch": 5.305910543130991, + "grad_norm": 0.12824396789073944, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 6643 + }, + { + "epoch": 5.306709265175719, + "grad_norm": 0.09452345222234726, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6644 + }, + { + "epoch": 5.307507987220447, + "grad_norm": 0.09100557118654251, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6645 + }, + { + "epoch": 5.3083067092651754, + "grad_norm": 0.07995713502168655, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6646 + }, + { + "epoch": 5.309105431309904, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6647 + }, + { + "epoch": 5.3099041533546325, + "grad_norm": 0.09881234914064407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6648 + }, + { + "epoch": 5.310702875399361, + "grad_norm": 0.08131393790245056, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 6649 + }, + { + "epoch": 5.31150159744409, + "grad_norm": 0.08842889964580536, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6650 + }, + { + "epoch": 5.312300319488818, + "grad_norm": 0.12630115449428558, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6651 + }, + { + "epoch": 5.313099041533547, + "grad_norm": 0.13429711759090424, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6652 + }, + { + "epoch": 5.313897763578275, + "grad_norm": 0.11347261816263199, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6653 + }, + { + "epoch": 5.314696485623003, + "grad_norm": 0.1555728167295456, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6654 + }, + { + "epoch": 5.315495207667731, + "grad_norm": 0.13184282183647156, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6655 + }, + { + "epoch": 5.31629392971246, + "grad_norm": 0.07821093499660492, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 6656 + }, + { + "epoch": 5.317092651757188, + "grad_norm": 0.1300499588251114, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6657 + }, + { + "epoch": 5.317891373801917, + "grad_norm": 0.14896781742572784, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6658 + }, + { + "epoch": 5.318690095846645, + "grad_norm": 0.13370175659656525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6659 + }, + { + "epoch": 5.319488817891374, + "grad_norm": 0.14055652916431427, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6660 + }, + { + "epoch": 5.3202875399361025, + "grad_norm": 0.11674464493989944, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6661 + }, + { + "epoch": 5.321086261980831, + "grad_norm": 0.13155756890773773, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6662 + }, + { + "epoch": 5.321884984025559, + "grad_norm": 0.09616535156965256, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6663 + }, + { + "epoch": 5.322683706070287, + "grad_norm": 0.4228188991546631, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6664 + }, + { + "epoch": 5.323482428115016, + "grad_norm": 0.10942913591861725, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 6665 + }, + { + "epoch": 5.324281150159744, + "grad_norm": 0.15592730045318604, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6666 + }, + { + "epoch": 5.325079872204473, + "grad_norm": 0.16837753355503082, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 6667 + }, + { + "epoch": 5.325878594249201, + "grad_norm": 0.10512012243270874, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 6668 + }, + { + "epoch": 5.32667731629393, + "grad_norm": 0.10834471136331558, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6669 + }, + { + "epoch": 5.327476038338658, + "grad_norm": 0.06588451564311981, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6670 + }, + { + "epoch": 5.328274760383387, + "grad_norm": 0.08714822679758072, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6671 + }, + { + "epoch": 5.329073482428115, + "grad_norm": 0.16129685938358307, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 6672 + }, + { + "epoch": 5.329872204472843, + "grad_norm": 0.09294751286506653, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6673 + }, + { + "epoch": 5.330670926517572, + "grad_norm": 0.09905052185058594, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6674 + }, + { + "epoch": 5.3314696485623, + "grad_norm": 0.14584603905677795, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6675 + }, + { + "epoch": 5.332268370607029, + "grad_norm": 0.08384378254413605, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6676 + }, + { + "epoch": 5.333067092651757, + "grad_norm": 0.1672045886516571, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6677 + }, + { + "epoch": 5.333865814696486, + "grad_norm": 0.21656489372253418, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6678 + }, + { + "epoch": 5.334664536741214, + "grad_norm": 0.17034684121608734, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6679 + }, + { + "epoch": 5.335463258785943, + "grad_norm": 0.3153417408466339, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6680 + }, + { + "epoch": 5.336261980830671, + "grad_norm": 0.1953393816947937, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6681 + }, + { + "epoch": 5.3370607028754, + "grad_norm": 0.2085847705602646, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6682 + }, + { + "epoch": 5.337859424920127, + "grad_norm": 0.2679558992385864, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6683 + }, + { + "epoch": 5.338658146964856, + "grad_norm": 0.08705966919660568, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6684 + }, + { + "epoch": 5.3394568690095845, + "grad_norm": 0.09011410176753998, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6685 + }, + { + "epoch": 5.340255591054313, + "grad_norm": 0.10358326137065887, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6686 + }, + { + "epoch": 5.3410543130990416, + "grad_norm": 0.08191518485546112, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6687 + }, + { + "epoch": 5.34185303514377, + "grad_norm": 0.0676165446639061, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6688 + }, + { + "epoch": 5.342651757188499, + "grad_norm": 0.18006695806980133, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6689 + }, + { + "epoch": 5.343450479233227, + "grad_norm": 0.11935598403215408, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6690 + }, + { + "epoch": 5.344249201277956, + "grad_norm": 0.14136075973510742, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6691 + }, + { + "epoch": 5.345047923322683, + "grad_norm": 0.19367988407611847, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6692 + }, + { + "epoch": 5.345846645367412, + "grad_norm": 0.1283622533082962, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6693 + }, + { + "epoch": 5.34664536741214, + "grad_norm": 0.11303326487541199, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6694 + }, + { + "epoch": 5.347444089456869, + "grad_norm": 0.09076731652021408, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 6695 + }, + { + "epoch": 5.348242811501597, + "grad_norm": 0.12625159323215485, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6696 + }, + { + "epoch": 5.349041533546326, + "grad_norm": 0.18254370987415314, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 6697 + }, + { + "epoch": 5.3498402555910545, + "grad_norm": 0.12221173942089081, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6698 + }, + { + "epoch": 5.350638977635783, + "grad_norm": 0.11586996912956238, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6699 + }, + { + "epoch": 5.3514376996805115, + "grad_norm": 0.1012619286775589, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6700 + }, + { + "epoch": 5.352236421725239, + "grad_norm": 0.10728003084659576, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 6701 + }, + { + "epoch": 5.353035143769968, + "grad_norm": 0.08077894896268845, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 6702 + }, + { + "epoch": 5.353833865814696, + "grad_norm": 0.10069102048873901, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6703 + }, + { + "epoch": 5.354632587859425, + "grad_norm": 0.11007717996835709, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6704 + }, + { + "epoch": 5.355431309904153, + "grad_norm": 0.08088147640228271, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6705 + }, + { + "epoch": 5.356230031948882, + "grad_norm": 0.06969337165355682, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6706 + }, + { + "epoch": 5.35702875399361, + "grad_norm": 0.09731647372245789, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6707 + }, + { + "epoch": 5.357827476038339, + "grad_norm": 0.07404995709657669, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6708 + }, + { + "epoch": 5.358626198083067, + "grad_norm": 0.09361755102872849, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 6709 + }, + { + "epoch": 5.359424920127796, + "grad_norm": 0.11929210275411606, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6710 + }, + { + "epoch": 5.360223642172524, + "grad_norm": 0.11107892543077469, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6711 + }, + { + "epoch": 5.361022364217252, + "grad_norm": 0.10966535657644272, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6712 + }, + { + "epoch": 5.361821086261981, + "grad_norm": 0.11830565333366394, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6713 + }, + { + "epoch": 5.362619808306709, + "grad_norm": 0.15130563080310822, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6714 + }, + { + "epoch": 5.363418530351438, + "grad_norm": 0.12608309090137482, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6715 + }, + { + "epoch": 5.364217252396166, + "grad_norm": 0.10768693685531616, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6716 + }, + { + "epoch": 5.365015974440895, + "grad_norm": 0.10020256787538528, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6717 + }, + { + "epoch": 5.365814696485623, + "grad_norm": 0.11352406442165375, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6718 + }, + { + "epoch": 5.366613418530352, + "grad_norm": 0.10058535635471344, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6719 + }, + { + "epoch": 5.36741214057508, + "grad_norm": 0.08427922427654266, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6720 + }, + { + "epoch": 5.368210862619808, + "grad_norm": 0.08600196242332458, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 6721 + }, + { + "epoch": 5.3690095846645365, + "grad_norm": 0.0891844630241394, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 6722 + }, + { + "epoch": 5.369808306709265, + "grad_norm": 0.07231339812278748, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6723 + }, + { + "epoch": 5.3706070287539935, + "grad_norm": 0.0866503193974495, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6724 + }, + { + "epoch": 5.371405750798722, + "grad_norm": 0.44905656576156616, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 6725 + }, + { + "epoch": 5.372204472843451, + "grad_norm": 0.2192242592573166, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 6726 + }, + { + "epoch": 5.373003194888179, + "grad_norm": 0.15841859579086304, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 6727 + }, + { + "epoch": 5.373801916932908, + "grad_norm": 0.1254468858242035, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6728 + }, + { + "epoch": 5.374600638977636, + "grad_norm": 1.5675911903381348, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6729 + }, + { + "epoch": 5.375399361022364, + "grad_norm": 0.20507164299488068, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6730 + }, + { + "epoch": 5.376198083067092, + "grad_norm": 0.26948630809783936, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 6731 + }, + { + "epoch": 5.376996805111821, + "grad_norm": 0.15447315573692322, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 6732 + }, + { + "epoch": 5.377795527156549, + "grad_norm": 0.17888243496418, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 6733 + }, + { + "epoch": 5.378594249201278, + "grad_norm": 0.24683290719985962, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6734 + }, + { + "epoch": 5.3793929712460065, + "grad_norm": 0.15786881744861603, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6735 + }, + { + "epoch": 5.380191693290735, + "grad_norm": 0.18426702916622162, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6736 + }, + { + "epoch": 5.3809904153354635, + "grad_norm": 0.14444448053836823, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 6737 + }, + { + "epoch": 5.381789137380192, + "grad_norm": 0.135011225938797, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6738 + }, + { + "epoch": 5.38258785942492, + "grad_norm": 0.19057826697826385, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 6739 + }, + { + "epoch": 5.383386581469648, + "grad_norm": 0.12282486259937286, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6740 + }, + { + "epoch": 5.384185303514377, + "grad_norm": 0.17092294991016388, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 6741 + }, + { + "epoch": 5.384984025559105, + "grad_norm": 0.19800473749637604, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6742 + }, + { + "epoch": 5.385782747603834, + "grad_norm": 0.07987766712903976, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6743 + }, + { + "epoch": 5.386581469648562, + "grad_norm": 0.18386386334896088, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6744 + }, + { + "epoch": 5.387380191693291, + "grad_norm": 0.16529197990894318, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6745 + }, + { + "epoch": 5.388178913738019, + "grad_norm": 0.09607496112585068, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6746 + }, + { + "epoch": 5.388977635782748, + "grad_norm": 0.15966713428497314, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 6747 + }, + { + "epoch": 5.389776357827476, + "grad_norm": 0.1622796356678009, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6748 + }, + { + "epoch": 5.390575079872204, + "grad_norm": 0.09537432342767715, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6749 + }, + { + "epoch": 5.391373801916933, + "grad_norm": 0.1766965389251709, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6750 + }, + { + "epoch": 5.392172523961661, + "grad_norm": 0.21354711055755615, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6751 + }, + { + "epoch": 5.39297124600639, + "grad_norm": 0.093564473092556, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 6752 + }, + { + "epoch": 5.393769968051118, + "grad_norm": 0.14756347239017487, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6753 + }, + { + "epoch": 5.394568690095847, + "grad_norm": 0.10537468641996384, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 6754 + }, + { + "epoch": 5.395367412140575, + "grad_norm": 0.15626567602157593, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6755 + }, + { + "epoch": 5.396166134185304, + "grad_norm": 0.16282637417316437, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6756 + }, + { + "epoch": 5.396964856230032, + "grad_norm": 0.0745241791009903, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6757 + }, + { + "epoch": 5.397763578274761, + "grad_norm": 0.1221894845366478, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6758 + }, + { + "epoch": 5.3985623003194885, + "grad_norm": 0.08314131945371628, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6759 + }, + { + "epoch": 5.399361022364217, + "grad_norm": 0.12707264721393585, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6760 + }, + { + "epoch": 5.4001597444089455, + "grad_norm": 0.12036006152629852, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6761 + }, + { + "epoch": 5.400958466453674, + "grad_norm": 0.12769176065921783, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6762 + }, + { + "epoch": 5.401757188498403, + "grad_norm": 0.2201661318540573, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6763 + }, + { + "epoch": 5.402555910543131, + "grad_norm": 0.15013982355594635, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6764 + }, + { + "epoch": 5.40335463258786, + "grad_norm": 0.7714766263961792, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6765 + }, + { + "epoch": 5.404153354632588, + "grad_norm": 0.20359933376312256, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6766 + }, + { + "epoch": 5.404952076677317, + "grad_norm": 0.12684984505176544, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6767 + }, + { + "epoch": 5.405750798722044, + "grad_norm": 0.09804195165634155, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 6768 + }, + { + "epoch": 5.406549520766773, + "grad_norm": 0.10416880995035172, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 6769 + }, + { + "epoch": 5.407348242811501, + "grad_norm": 0.1509416699409485, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6770 + }, + { + "epoch": 5.40814696485623, + "grad_norm": 0.15458443760871887, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 6771 + }, + { + "epoch": 5.4089456869009584, + "grad_norm": 0.08355830609798431, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 6772 + }, + { + "epoch": 5.409744408945687, + "grad_norm": 0.1228979080915451, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 6773 + }, + { + "epoch": 5.4105431309904155, + "grad_norm": 0.12139632552862167, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6774 + }, + { + "epoch": 5.411341853035144, + "grad_norm": 0.16298502683639526, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6775 + }, + { + "epoch": 5.412140575079873, + "grad_norm": 0.09110788255929947, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 6776 + }, + { + "epoch": 5.4129392971246, + "grad_norm": 0.08584781736135483, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 6777 + }, + { + "epoch": 5.413738019169329, + "grad_norm": 0.10148828476667404, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6778 + }, + { + "epoch": 5.414536741214057, + "grad_norm": 0.1046212688088417, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 6779 + }, + { + "epoch": 5.415335463258786, + "grad_norm": 0.12530827522277832, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 6780 + }, + { + "epoch": 5.416134185303514, + "grad_norm": 0.07337464392185211, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6781 + }, + { + "epoch": 5.416932907348243, + "grad_norm": 0.10839185118675232, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6782 + }, + { + "epoch": 5.417731629392971, + "grad_norm": 0.07784926891326904, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6783 + }, + { + "epoch": 5.4185303514377, + "grad_norm": 0.08692190796136856, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6784 + }, + { + "epoch": 5.419329073482428, + "grad_norm": 0.08721921592950821, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6785 + }, + { + "epoch": 5.420127795527157, + "grad_norm": 0.09581280499696732, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 6786 + }, + { + "epoch": 5.420926517571885, + "grad_norm": 0.1156916618347168, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6787 + }, + { + "epoch": 5.421725239616613, + "grad_norm": 0.4520327150821686, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6788 + }, + { + "epoch": 5.422523961661342, + "grad_norm": 0.0948205217719078, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 6789 + }, + { + "epoch": 5.42332268370607, + "grad_norm": 0.07208927720785141, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6790 + }, + { + "epoch": 5.424121405750799, + "grad_norm": 0.06830724328756332, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6791 + }, + { + "epoch": 5.424920127795527, + "grad_norm": 0.10488666594028473, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6792 + }, + { + "epoch": 5.425718849840256, + "grad_norm": 0.08509235084056854, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6793 + }, + { + "epoch": 5.426517571884984, + "grad_norm": 0.09133832901716232, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 6794 + }, + { + "epoch": 5.427316293929713, + "grad_norm": 0.11715687066316605, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 6795 + }, + { + "epoch": 5.428115015974441, + "grad_norm": 0.1196032389998436, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 6796 + }, + { + "epoch": 5.428913738019169, + "grad_norm": 0.14141549170017242, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 6797 + }, + { + "epoch": 5.4297124600638975, + "grad_norm": 0.12866206467151642, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6798 + }, + { + "epoch": 5.430511182108626, + "grad_norm": 0.10802716016769409, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6799 + }, + { + "epoch": 5.431309904153355, + "grad_norm": 0.10947239398956299, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6800 + }, + { + "epoch": 5.432108626198083, + "grad_norm": 0.08339721709489822, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6801 + }, + { + "epoch": 5.432907348242812, + "grad_norm": 0.12407296150922775, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 6802 + }, + { + "epoch": 5.43370607028754, + "grad_norm": 0.10537894070148468, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6803 + }, + { + "epoch": 5.434504792332269, + "grad_norm": 0.0920059084892273, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6804 + }, + { + "epoch": 5.435303514376997, + "grad_norm": 0.1502516269683838, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6805 + }, + { + "epoch": 5.436102236421725, + "grad_norm": 0.2798864245414734, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 6806 + }, + { + "epoch": 5.436900958466453, + "grad_norm": 0.11037585884332657, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6807 + }, + { + "epoch": 5.437699680511182, + "grad_norm": 0.12594881653785706, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6808 + }, + { + "epoch": 5.43849840255591, + "grad_norm": 0.09976109862327576, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6809 + }, + { + "epoch": 5.439297124600639, + "grad_norm": 0.3285512328147888, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 6810 + }, + { + "epoch": 5.4400958466453675, + "grad_norm": 0.49450287222862244, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6811 + }, + { + "epoch": 5.440894568690096, + "grad_norm": 0.06817556917667389, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6812 + }, + { + "epoch": 5.4416932907348246, + "grad_norm": 0.14917057752609253, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6813 + }, + { + "epoch": 5.442492012779553, + "grad_norm": 0.10008134692907333, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6814 + }, + { + "epoch": 5.443290734824281, + "grad_norm": 0.07854767143726349, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 6815 + }, + { + "epoch": 5.444089456869009, + "grad_norm": 0.2441248893737793, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6816 + }, + { + "epoch": 5.444888178913738, + "grad_norm": 0.1276157647371292, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6817 + }, + { + "epoch": 5.445686900958466, + "grad_norm": 0.11779431253671646, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 6818 + }, + { + "epoch": 5.446485623003195, + "grad_norm": 0.11788108944892883, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6819 + }, + { + "epoch": 5.447284345047923, + "grad_norm": 0.06554995477199554, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6820 + }, + { + "epoch": 5.448083067092652, + "grad_norm": 0.07937108725309372, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6821 + }, + { + "epoch": 5.44888178913738, + "grad_norm": 0.08041426539421082, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 6822 + }, + { + "epoch": 5.449680511182109, + "grad_norm": 0.12429161369800568, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6823 + }, + { + "epoch": 5.4504792332268375, + "grad_norm": 0.09993165731430054, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6824 + }, + { + "epoch": 5.451277955271565, + "grad_norm": 0.07077670097351074, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 6825 + }, + { + "epoch": 5.452076677316294, + "grad_norm": 0.12163005024194717, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6826 + }, + { + "epoch": 5.452875399361022, + "grad_norm": 0.19080819189548492, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6827 + }, + { + "epoch": 5.453674121405751, + "grad_norm": 0.06450853496789932, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6828 + }, + { + "epoch": 5.454472843450479, + "grad_norm": 0.8893078565597534, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 6829 + }, + { + "epoch": 5.455271565495208, + "grad_norm": 0.08225185424089432, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6830 + }, + { + "epoch": 5.456070287539936, + "grad_norm": 0.08631845563650131, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 6831 + }, + { + "epoch": 5.456869009584665, + "grad_norm": 0.1858949214220047, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 6832 + }, + { + "epoch": 5.457667731629393, + "grad_norm": 0.10997786372900009, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 6833 + }, + { + "epoch": 5.458466453674122, + "grad_norm": 0.09691416472196579, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6834 + }, + { + "epoch": 5.4592651757188495, + "grad_norm": 0.12523561716079712, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6835 + }, + { + "epoch": 5.460063897763578, + "grad_norm": 0.10094364732503891, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6836 + }, + { + "epoch": 5.460862619808307, + "grad_norm": 0.06598310172557831, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 6837 + }, + { + "epoch": 5.461661341853035, + "grad_norm": 0.10221479833126068, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6838 + }, + { + "epoch": 5.462460063897764, + "grad_norm": 0.6545975804328918, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6839 + }, + { + "epoch": 5.463258785942492, + "grad_norm": 0.12167128920555115, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6840 + }, + { + "epoch": 5.464057507987221, + "grad_norm": 0.10822924226522446, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6841 + }, + { + "epoch": 5.464856230031949, + "grad_norm": 0.11905575543642044, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6842 + }, + { + "epoch": 5.465654952076678, + "grad_norm": 0.10276103764772415, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 6843 + }, + { + "epoch": 5.466453674121405, + "grad_norm": 0.09087378531694412, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6844 + }, + { + "epoch": 5.467252396166134, + "grad_norm": 0.13117510080337524, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6845 + }, + { + "epoch": 5.468051118210862, + "grad_norm": 0.14824305474758148, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6846 + }, + { + "epoch": 5.468849840255591, + "grad_norm": 0.08553508669137955, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 6847 + }, + { + "epoch": 5.4696485623003195, + "grad_norm": 0.12209141999483109, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6848 + }, + { + "epoch": 5.470447284345048, + "grad_norm": 0.1992058902978897, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6849 + }, + { + "epoch": 5.4712460063897765, + "grad_norm": 0.08518865704536438, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6850 + }, + { + "epoch": 5.472044728434505, + "grad_norm": 0.10496464371681213, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 6851 + }, + { + "epoch": 5.472843450479234, + "grad_norm": 0.08789866417646408, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 6852 + }, + { + "epoch": 5.473642172523961, + "grad_norm": 0.08592598885297775, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6853 + }, + { + "epoch": 5.47444089456869, + "grad_norm": 0.061165813356637955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6854 + }, + { + "epoch": 5.475239616613418, + "grad_norm": 0.06936467438936234, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 6855 + }, + { + "epoch": 5.476038338658147, + "grad_norm": 0.20519734919071198, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 6856 + }, + { + "epoch": 5.476837060702875, + "grad_norm": 0.087073415517807, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6857 + }, + { + "epoch": 5.477635782747604, + "grad_norm": 0.10153642296791077, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6858 + }, + { + "epoch": 5.478434504792332, + "grad_norm": 0.12416163831949234, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6859 + }, + { + "epoch": 5.479233226837061, + "grad_norm": 0.1047174334526062, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6860 + }, + { + "epoch": 5.4800319488817895, + "grad_norm": 0.13690868020057678, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6861 + }, + { + "epoch": 5.480830670926517, + "grad_norm": 0.15995970368385315, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 6862 + }, + { + "epoch": 5.481629392971246, + "grad_norm": 0.08172900229692459, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 6863 + }, + { + "epoch": 5.482428115015974, + "grad_norm": 0.10956761986017227, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 6864 + }, + { + "epoch": 5.483226837060703, + "grad_norm": 0.12259931862354279, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 6865 + }, + { + "epoch": 5.484025559105431, + "grad_norm": 0.08295698463916779, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 6866 + }, + { + "epoch": 5.48482428115016, + "grad_norm": 0.10935505479574203, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 6867 + }, + { + "epoch": 5.485623003194888, + "grad_norm": 0.12436006963253021, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 6868 + }, + { + "epoch": 5.486421725239617, + "grad_norm": 0.08449307829141617, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6869 + }, + { + "epoch": 5.487220447284345, + "grad_norm": 0.10897113382816315, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6870 + }, + { + "epoch": 5.488019169329074, + "grad_norm": 0.06856910139322281, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 6871 + }, + { + "epoch": 5.488817891373802, + "grad_norm": 0.07105988264083862, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6872 + }, + { + "epoch": 5.48961661341853, + "grad_norm": 0.08778723329305649, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6873 + }, + { + "epoch": 5.4904153354632586, + "grad_norm": 0.07818275690078735, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6874 + }, + { + "epoch": 5.491214057507987, + "grad_norm": 0.08410139381885529, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 6875 + }, + { + "epoch": 5.492012779552716, + "grad_norm": 0.0804608166217804, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6876 + }, + { + "epoch": 5.492811501597444, + "grad_norm": 0.10089578479528427, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 6877 + }, + { + "epoch": 5.493610223642173, + "grad_norm": 0.08231056481599808, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6878 + }, + { + "epoch": 5.494408945686901, + "grad_norm": 0.07642059773206711, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 6879 + }, + { + "epoch": 5.49520766773163, + "grad_norm": 0.11312755942344666, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6880 + }, + { + "epoch": 5.496006389776358, + "grad_norm": 0.06288543343544006, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6881 + }, + { + "epoch": 5.496805111821086, + "grad_norm": 0.09648934751749039, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6882 + }, + { + "epoch": 5.497603833865814, + "grad_norm": 0.09374719858169556, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6883 + }, + { + "epoch": 5.498402555910543, + "grad_norm": 0.10596928000450134, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 6884 + }, + { + "epoch": 5.4992012779552715, + "grad_norm": 0.06540077924728394, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 6885 + }, + { + "epoch": 5.5, + "grad_norm": 0.05208199843764305, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 6886 + }, + { + "epoch": 5.5007987220447285, + "grad_norm": 0.10762238502502441, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 6887 + }, + { + "epoch": 5.501597444089457, + "grad_norm": 0.122553251683712, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6888 + }, + { + "epoch": 5.502396166134186, + "grad_norm": 0.07663412392139435, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 6889 + }, + { + "epoch": 5.503194888178914, + "grad_norm": 0.09100968390703201, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6890 + }, + { + "epoch": 5.503993610223642, + "grad_norm": 0.24931807816028595, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 6891 + }, + { + "epoch": 5.50479233226837, + "grad_norm": 0.07812821120023727, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 6892 + }, + { + "epoch": 5.505591054313099, + "grad_norm": 0.04760657623410225, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6893 + }, + { + "epoch": 5.506389776357827, + "grad_norm": 0.08183290809392929, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6894 + }, + { + "epoch": 5.507188498402556, + "grad_norm": 0.09541092067956924, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6895 + }, + { + "epoch": 5.507987220447284, + "grad_norm": 0.04168708249926567, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 6896 + }, + { + "epoch": 5.508785942492013, + "grad_norm": 0.07038994133472443, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6897 + }, + { + "epoch": 5.5095846645367414, + "grad_norm": 0.060375142842531204, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 6898 + }, + { + "epoch": 5.51038338658147, + "grad_norm": 0.048829223960638046, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 6899 + }, + { + "epoch": 5.511182108626198, + "grad_norm": 0.057894766330718994, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6900 + }, + { + "epoch": 5.511980830670926, + "grad_norm": 0.05786101892590523, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 6901 + }, + { + "epoch": 5.512779552715655, + "grad_norm": 0.07246953994035721, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 6902 + }, + { + "epoch": 5.513578274760383, + "grad_norm": 0.07493462413549423, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 6903 + }, + { + "epoch": 5.514376996805112, + "grad_norm": 0.060612600296735764, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 6904 + }, + { + "epoch": 5.51517571884984, + "grad_norm": 0.0666302740573883, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6905 + }, + { + "epoch": 5.515974440894569, + "grad_norm": 0.08713024109601974, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6906 + }, + { + "epoch": 5.516773162939297, + "grad_norm": 0.31083860993385315, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6907 + }, + { + "epoch": 5.517571884984026, + "grad_norm": 0.0808933675289154, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6908 + }, + { + "epoch": 5.518370607028754, + "grad_norm": 0.1312016248703003, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 6909 + }, + { + "epoch": 5.519169329073483, + "grad_norm": 0.20448890328407288, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6910 + }, + { + "epoch": 5.5199680511182105, + "grad_norm": 0.2519006133079529, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 6911 + }, + { + "epoch": 5.520766773162939, + "grad_norm": 0.11359903216362, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 6912 + }, + { + "epoch": 5.521565495207668, + "grad_norm": 0.07498760521411896, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 6913 + }, + { + "epoch": 5.522364217252396, + "grad_norm": 0.06599561125040054, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 6914 + }, + { + "epoch": 5.523162939297125, + "grad_norm": 0.08988697826862335, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 6915 + }, + { + "epoch": 5.523961661341853, + "grad_norm": 0.06968241930007935, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 6916 + }, + { + "epoch": 5.524760383386582, + "grad_norm": 0.07231415063142776, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 6917 + }, + { + "epoch": 5.52555910543131, + "grad_norm": 0.07369428128004074, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 6918 + }, + { + "epoch": 5.526357827476039, + "grad_norm": 0.07677069306373596, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 6919 + }, + { + "epoch": 5.527156549520766, + "grad_norm": 0.07391869276762009, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 6920 + }, + { + "epoch": 5.527955271565495, + "grad_norm": 0.05270293354988098, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6921 + }, + { + "epoch": 5.5287539936102235, + "grad_norm": 0.10439106076955795, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 6922 + }, + { + "epoch": 5.529552715654952, + "grad_norm": 0.06968904286623001, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 6923 + }, + { + "epoch": 5.5303514376996805, + "grad_norm": 0.08401032537221909, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 6924 + }, + { + "epoch": 5.531150159744409, + "grad_norm": 0.11993245035409927, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 6925 + }, + { + "epoch": 5.531948881789138, + "grad_norm": 0.05857640504837036, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 6926 + }, + { + "epoch": 5.532747603833866, + "grad_norm": 0.10513442009687424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 6927 + }, + { + "epoch": 5.533546325878595, + "grad_norm": 0.12233056873083115, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6928 + }, + { + "epoch": 5.534345047923322, + "grad_norm": 0.06959997117519379, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 6929 + }, + { + "epoch": 5.535143769968051, + "grad_norm": 0.08057182282209396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 6930 + }, + { + "epoch": 5.535942492012779, + "grad_norm": 0.09816458821296692, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 6931 + }, + { + "epoch": 5.536741214057508, + "grad_norm": 0.055738940834999084, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 6932 + }, + { + "epoch": 5.537539936102236, + "grad_norm": 0.0939234122633934, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 6933 + }, + { + "epoch": 5.538338658146965, + "grad_norm": 0.12143029272556305, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6934 + }, + { + "epoch": 5.539137380191693, + "grad_norm": 0.08409210294485092, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 6935 + }, + { + "epoch": 5.539936102236422, + "grad_norm": 0.10690448433160782, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 6936 + }, + { + "epoch": 5.5407348242811505, + "grad_norm": 0.20701836049556732, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 6937 + }, + { + "epoch": 5.541533546325878, + "grad_norm": 0.09124163538217545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6938 + }, + { + "epoch": 5.542332268370607, + "grad_norm": 0.08295103162527084, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 6939 + }, + { + "epoch": 5.543130990415335, + "grad_norm": 0.1179230809211731, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 6940 + }, + { + "epoch": 5.543929712460064, + "grad_norm": 0.12345689535140991, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6941 + }, + { + "epoch": 5.544728434504792, + "grad_norm": 0.052616000175476074, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 6942 + }, + { + "epoch": 5.545527156549521, + "grad_norm": 0.07918131351470947, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 6943 + }, + { + "epoch": 5.546325878594249, + "grad_norm": 0.04847119748592377, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 6944 + }, + { + "epoch": 5.547124600638978, + "grad_norm": 0.06204143166542053, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6945 + }, + { + "epoch": 5.547923322683706, + "grad_norm": 0.07778293639421463, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 6946 + }, + { + "epoch": 5.548722044728435, + "grad_norm": 0.05037623643875122, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 6947 + }, + { + "epoch": 5.549520766773163, + "grad_norm": 0.09024710208177567, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 6948 + }, + { + "epoch": 5.550319488817891, + "grad_norm": 0.0872211754322052, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 6949 + }, + { + "epoch": 5.55111821086262, + "grad_norm": 0.08456625789403915, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 6950 + }, + { + "epoch": 5.551916932907348, + "grad_norm": 0.054692018777132034, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 6951 + }, + { + "epoch": 5.552715654952077, + "grad_norm": 0.10690787434577942, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6952 + }, + { + "epoch": 5.553514376996805, + "grad_norm": 0.07764400541782379, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 6953 + }, + { + "epoch": 5.554313099041534, + "grad_norm": 0.08423051983118057, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 6954 + }, + { + "epoch": 5.555111821086262, + "grad_norm": 0.06771727651357651, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6955 + }, + { + "epoch": 5.555910543130991, + "grad_norm": 0.10505887866020203, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6956 + }, + { + "epoch": 5.556709265175719, + "grad_norm": 0.054641906172037125, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6957 + }, + { + "epoch": 5.557507987220447, + "grad_norm": 0.05115118622779846, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 6958 + }, + { + "epoch": 5.5583067092651754, + "grad_norm": 0.07177245616912842, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 6959 + }, + { + "epoch": 5.559105431309904, + "grad_norm": 0.06642751395702362, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 6960 + }, + { + "epoch": 5.5599041533546325, + "grad_norm": 0.08428867161273956, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 6961 + }, + { + "epoch": 5.560702875399361, + "grad_norm": 0.044375378638505936, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 6962 + }, + { + "epoch": 5.56150159744409, + "grad_norm": 0.06384986639022827, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 6963 + }, + { + "epoch": 5.562300319488818, + "grad_norm": 0.052885912358760834, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 6964 + }, + { + "epoch": 5.563099041533547, + "grad_norm": 0.05244029313325882, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6965 + }, + { + "epoch": 5.563897763578275, + "grad_norm": 0.1781054139137268, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 6966 + }, + { + "epoch": 5.564696485623003, + "grad_norm": 0.8067191243171692, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 6967 + }, + { + "epoch": 5.565495207667731, + "grad_norm": 0.0759076327085495, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 6968 + }, + { + "epoch": 5.56629392971246, + "grad_norm": 0.0820186585187912, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 6969 + }, + { + "epoch": 5.567092651757188, + "grad_norm": 2.901848316192627, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 6970 + }, + { + "epoch": 5.567891373801917, + "grad_norm": 0.5663259625434875, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6971 + }, + { + "epoch": 5.568690095846645, + "grad_norm": 0.34909728169441223, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 6972 + }, + { + "epoch": 5.569488817891374, + "grad_norm": 0.3031843602657318, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 6973 + }, + { + "epoch": 5.5702875399361025, + "grad_norm": 0.9258882403373718, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 6974 + }, + { + "epoch": 5.571086261980831, + "grad_norm": 0.37162891030311584, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 6975 + }, + { + "epoch": 5.571884984025559, + "grad_norm": 0.11269918829202652, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 6976 + }, + { + "epoch": 5.572683706070287, + "grad_norm": 0.20953021943569183, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 6977 + }, + { + "epoch": 5.573482428115016, + "grad_norm": 0.22324982285499573, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 6978 + }, + { + "epoch": 5.574281150159744, + "grad_norm": 0.47017180919647217, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 6979 + }, + { + "epoch": 5.575079872204473, + "grad_norm": 0.22266747057437897, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 6980 + }, + { + "epoch": 5.575878594249201, + "grad_norm": 0.1609373688697815, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 6981 + }, + { + "epoch": 5.57667731629393, + "grad_norm": 0.17458784580230713, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 6982 + }, + { + "epoch": 5.577476038338658, + "grad_norm": 0.17354144155979156, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 6983 + }, + { + "epoch": 5.578274760383387, + "grad_norm": 0.10959888994693756, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 6984 + }, + { + "epoch": 5.5790734824281145, + "grad_norm": 0.22630754113197327, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6985 + }, + { + "epoch": 5.579872204472844, + "grad_norm": 0.3786774277687073, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6986 + }, + { + "epoch": 5.580670926517572, + "grad_norm": 0.13818539679050446, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 6987 + }, + { + "epoch": 5.5814696485623, + "grad_norm": 0.22202269732952118, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 6988 + }, + { + "epoch": 5.582268370607029, + "grad_norm": 0.08324426412582397, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 6989 + }, + { + "epoch": 5.583067092651757, + "grad_norm": 0.16399513185024261, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 6990 + }, + { + "epoch": 5.583865814696486, + "grad_norm": 0.13956478238105774, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 6991 + }, + { + "epoch": 5.584664536741214, + "grad_norm": 0.09159751981496811, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 6992 + }, + { + "epoch": 5.585463258785943, + "grad_norm": 0.19404387474060059, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 6993 + }, + { + "epoch": 5.586261980830671, + "grad_norm": 0.07866083085536957, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 6994 + }, + { + "epoch": 5.5870607028754, + "grad_norm": 0.10653684288263321, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 6995 + }, + { + "epoch": 5.587859424920127, + "grad_norm": 0.12254250794649124, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 6996 + }, + { + "epoch": 5.588658146964856, + "grad_norm": 0.0665711760520935, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 6997 + }, + { + "epoch": 5.5894568690095845, + "grad_norm": 0.1234782338142395, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 6998 + }, + { + "epoch": 5.590255591054313, + "grad_norm": 0.10345113277435303, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 6999 + }, + { + "epoch": 5.5910543130990416, + "grad_norm": 0.10187766700983047, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7000 + }, + { + "epoch": 5.59185303514377, + "grad_norm": 0.10330864042043686, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7001 + }, + { + "epoch": 5.592651757188499, + "grad_norm": 0.12427254766225815, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7002 + }, + { + "epoch": 5.593450479233227, + "grad_norm": 0.06854265183210373, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7003 + }, + { + "epoch": 5.594249201277956, + "grad_norm": 0.07029487192630768, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7004 + }, + { + "epoch": 5.595047923322683, + "grad_norm": 0.07483061403036118, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7005 + }, + { + "epoch": 5.595846645367412, + "grad_norm": 0.08542168885469437, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 7006 + }, + { + "epoch": 5.59664536741214, + "grad_norm": 0.05537399277091026, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7007 + }, + { + "epoch": 5.597444089456869, + "grad_norm": 0.28531956672668457, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7008 + }, + { + "epoch": 5.598242811501597, + "grad_norm": 0.1349600851535797, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7009 + }, + { + "epoch": 5.599041533546326, + "grad_norm": 0.06000711768865585, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7010 + }, + { + "epoch": 5.5998402555910545, + "grad_norm": 0.08139210939407349, + "learning_rate": 0.0005, + "loss": 1.0677, + "step": 7011 + }, + { + "epoch": 5.600638977635783, + "grad_norm": 0.08603602647781372, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7012 + }, + { + "epoch": 5.6014376996805115, + "grad_norm": 0.06586270034313202, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7013 + }, + { + "epoch": 5.602236421725239, + "grad_norm": 0.06276310235261917, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7014 + }, + { + "epoch": 5.603035143769968, + "grad_norm": 0.06072620674967766, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7015 + }, + { + "epoch": 5.603833865814696, + "grad_norm": 0.07509211450815201, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7016 + }, + { + "epoch": 5.604632587859425, + "grad_norm": 0.07241938263177872, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7017 + }, + { + "epoch": 5.605431309904153, + "grad_norm": 0.05110672488808632, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7018 + }, + { + "epoch": 5.606230031948882, + "grad_norm": 0.043005820363759995, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7019 + }, + { + "epoch": 5.60702875399361, + "grad_norm": 0.06298743188381195, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7020 + }, + { + "epoch": 5.607827476038339, + "grad_norm": 0.09457913786172867, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7021 + }, + { + "epoch": 5.608626198083067, + "grad_norm": 0.08066218346357346, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7022 + }, + { + "epoch": 5.609424920127795, + "grad_norm": 0.0845603421330452, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7023 + }, + { + "epoch": 5.6102236421725244, + "grad_norm": 0.09121926873922348, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7024 + }, + { + "epoch": 5.611022364217252, + "grad_norm": 0.12013491243124008, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7025 + }, + { + "epoch": 5.611821086261981, + "grad_norm": 0.062171660363674164, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7026 + }, + { + "epoch": 5.612619808306709, + "grad_norm": 0.05688954144716263, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7027 + }, + { + "epoch": 5.613418530351438, + "grad_norm": 0.049224793910980225, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7028 + }, + { + "epoch": 5.614217252396166, + "grad_norm": 0.06337599456310272, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7029 + }, + { + "epoch": 5.615015974440895, + "grad_norm": 0.03602084144949913, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7030 + }, + { + "epoch": 5.615814696485623, + "grad_norm": 0.06257645785808563, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7031 + }, + { + "epoch": 5.616613418530352, + "grad_norm": 0.09524381905794144, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7032 + }, + { + "epoch": 5.61741214057508, + "grad_norm": 0.06262468546628952, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7033 + }, + { + "epoch": 5.618210862619808, + "grad_norm": 0.23001722991466522, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7034 + }, + { + "epoch": 5.6190095846645365, + "grad_norm": 0.06312809139490128, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7035 + }, + { + "epoch": 5.619808306709265, + "grad_norm": 0.055973440408706665, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7036 + }, + { + "epoch": 5.6206070287539935, + "grad_norm": 0.0943455770611763, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7037 + }, + { + "epoch": 5.621405750798722, + "grad_norm": 0.05577901378273964, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7038 + }, + { + "epoch": 5.622204472843451, + "grad_norm": 0.057599395513534546, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7039 + }, + { + "epoch": 5.623003194888179, + "grad_norm": 0.07785748690366745, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 7040 + }, + { + "epoch": 5.623801916932908, + "grad_norm": 0.04796557500958443, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7041 + }, + { + "epoch": 5.624600638977636, + "grad_norm": 0.19438667595386505, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7042 + }, + { + "epoch": 5.625399361022364, + "grad_norm": 0.10055433958768845, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7043 + }, + { + "epoch": 5.626198083067092, + "grad_norm": 0.06082126125693321, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7044 + }, + { + "epoch": 5.626996805111821, + "grad_norm": 0.07862866669893265, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7045 + }, + { + "epoch": 5.627795527156549, + "grad_norm": 0.09042234718799591, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7046 + }, + { + "epoch": 5.628594249201278, + "grad_norm": 0.06087128072977066, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7047 + }, + { + "epoch": 5.6293929712460065, + "grad_norm": 0.04091280326247215, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7048 + }, + { + "epoch": 5.630191693290735, + "grad_norm": 0.0625537633895874, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7049 + }, + { + "epoch": 5.6309904153354635, + "grad_norm": 0.04506808891892433, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7050 + }, + { + "epoch": 5.631789137380192, + "grad_norm": 0.0750357061624527, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7051 + }, + { + "epoch": 5.63258785942492, + "grad_norm": 0.06990372389554977, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7052 + }, + { + "epoch": 5.633386581469648, + "grad_norm": 0.05008876323699951, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7053 + }, + { + "epoch": 5.634185303514377, + "grad_norm": 0.07472547143697739, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7054 + }, + { + "epoch": 5.634984025559105, + "grad_norm": 0.04004117101430893, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7055 + }, + { + "epoch": 5.635782747603834, + "grad_norm": 0.10103464871644974, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7056 + }, + { + "epoch": 5.636581469648562, + "grad_norm": 0.10850277543067932, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7057 + }, + { + "epoch": 5.637380191693291, + "grad_norm": 0.1109318807721138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7058 + }, + { + "epoch": 5.638178913738019, + "grad_norm": 0.06371457874774933, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7059 + }, + { + "epoch": 5.638977635782748, + "grad_norm": 0.1320749819278717, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7060 + }, + { + "epoch": 5.6397763578274756, + "grad_norm": 0.11957977712154388, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7061 + }, + { + "epoch": 5.640575079872205, + "grad_norm": 0.10327479988336563, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7062 + }, + { + "epoch": 5.641373801916933, + "grad_norm": 0.09731981158256531, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7063 + }, + { + "epoch": 5.642172523961661, + "grad_norm": 0.10276936739683151, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7064 + }, + { + "epoch": 5.64297124600639, + "grad_norm": 0.06973864883184433, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7065 + }, + { + "epoch": 5.643769968051118, + "grad_norm": 0.12020955234766006, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7066 + }, + { + "epoch": 5.644568690095847, + "grad_norm": 0.15950947999954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7067 + }, + { + "epoch": 5.645367412140575, + "grad_norm": 0.08034086227416992, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7068 + }, + { + "epoch": 5.646166134185304, + "grad_norm": 0.11269761621952057, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7069 + }, + { + "epoch": 5.646964856230032, + "grad_norm": 0.1569385826587677, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7070 + }, + { + "epoch": 5.647763578274761, + "grad_norm": 0.09290867298841476, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7071 + }, + { + "epoch": 5.6485623003194885, + "grad_norm": 0.0742817223072052, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7072 + }, + { + "epoch": 5.649361022364217, + "grad_norm": 0.3531377911567688, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7073 + }, + { + "epoch": 5.6501597444089455, + "grad_norm": 0.05365251749753952, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7074 + }, + { + "epoch": 5.650958466453674, + "grad_norm": 0.10185245424509048, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7075 + }, + { + "epoch": 5.651757188498403, + "grad_norm": 0.08978144079446793, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7076 + }, + { + "epoch": 5.652555910543131, + "grad_norm": 0.06563816964626312, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7077 + }, + { + "epoch": 5.65335463258786, + "grad_norm": 0.11167218536138535, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7078 + }, + { + "epoch": 5.654153354632588, + "grad_norm": 0.10078081488609314, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7079 + }, + { + "epoch": 5.654952076677317, + "grad_norm": 0.04581546410918236, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 7080 + }, + { + "epoch": 5.655750798722044, + "grad_norm": 0.04128880053758621, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7081 + }, + { + "epoch": 5.656549520766773, + "grad_norm": 0.0887683555483818, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7082 + }, + { + "epoch": 5.657348242811501, + "grad_norm": 0.06673122197389603, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7083 + }, + { + "epoch": 5.65814696485623, + "grad_norm": 0.12348195165395737, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7084 + }, + { + "epoch": 5.6589456869009584, + "grad_norm": 0.04828948527574539, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7085 + }, + { + "epoch": 5.659744408945687, + "grad_norm": 0.09094297885894775, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7086 + }, + { + "epoch": 5.6605431309904155, + "grad_norm": 0.05775933712720871, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7087 + }, + { + "epoch": 5.661341853035144, + "grad_norm": 0.06460239738225937, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7088 + }, + { + "epoch": 5.662140575079873, + "grad_norm": 0.07246532291173935, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7089 + }, + { + "epoch": 5.6629392971246, + "grad_norm": 0.05635413900017738, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7090 + }, + { + "epoch": 5.663738019169329, + "grad_norm": 0.05866781249642372, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7091 + }, + { + "epoch": 5.664536741214057, + "grad_norm": 0.11024738848209381, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7092 + }, + { + "epoch": 5.665335463258786, + "grad_norm": 2.880472421646118, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7093 + }, + { + "epoch": 5.666134185303514, + "grad_norm": 0.147624671459198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7094 + }, + { + "epoch": 5.666932907348243, + "grad_norm": 0.16042540967464447, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7095 + }, + { + "epoch": 5.667731629392971, + "grad_norm": 0.044081881642341614, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7096 + }, + { + "epoch": 5.6685303514377, + "grad_norm": 0.1580066829919815, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7097 + }, + { + "epoch": 5.669329073482428, + "grad_norm": 0.1348607987165451, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7098 + }, + { + "epoch": 5.670127795527156, + "grad_norm": 0.06525023281574249, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7099 + }, + { + "epoch": 5.6709265175718855, + "grad_norm": 0.12954704463481903, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7100 + }, + { + "epoch": 5.671725239616613, + "grad_norm": 0.09241525083780289, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7101 + }, + { + "epoch": 5.672523961661342, + "grad_norm": 0.05581163614988327, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7102 + }, + { + "epoch": 5.67332268370607, + "grad_norm": 0.0864885225892067, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7103 + }, + { + "epoch": 5.674121405750799, + "grad_norm": 0.0783633440732956, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7104 + }, + { + "epoch": 5.674920127795527, + "grad_norm": 2.419416666030884, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7105 + }, + { + "epoch": 5.675718849840256, + "grad_norm": 0.30067741870880127, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7106 + }, + { + "epoch": 5.676517571884984, + "grad_norm": 0.2876960337162018, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 7107 + }, + { + "epoch": 5.677316293929713, + "grad_norm": 0.13828304409980774, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7108 + }, + { + "epoch": 5.678115015974441, + "grad_norm": 0.12691721320152283, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7109 + }, + { + "epoch": 5.678913738019169, + "grad_norm": 0.18356311321258545, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 7110 + }, + { + "epoch": 5.6797124600638975, + "grad_norm": 0.13121426105499268, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7111 + }, + { + "epoch": 5.680511182108626, + "grad_norm": 0.13354304432868958, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7112 + }, + { + "epoch": 5.681309904153355, + "grad_norm": 0.10858450084924698, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7113 + }, + { + "epoch": 5.682108626198083, + "grad_norm": 0.12026678770780563, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 7114 + }, + { + "epoch": 5.682907348242812, + "grad_norm": 0.10297723114490509, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7115 + }, + { + "epoch": 5.68370607028754, + "grad_norm": 0.10481604188680649, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7116 + }, + { + "epoch": 5.684504792332269, + "grad_norm": 0.1389889419078827, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7117 + }, + { + "epoch": 5.685303514376997, + "grad_norm": 0.047913264483213425, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7118 + }, + { + "epoch": 5.686102236421725, + "grad_norm": 0.07504977285861969, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7119 + }, + { + "epoch": 5.686900958466453, + "grad_norm": 0.08858702331781387, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7120 + }, + { + "epoch": 5.687699680511182, + "grad_norm": 0.07746905088424683, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7121 + }, + { + "epoch": 5.68849840255591, + "grad_norm": 0.20370569825172424, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7122 + }, + { + "epoch": 5.689297124600639, + "grad_norm": 0.053284503519535065, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7123 + }, + { + "epoch": 5.6900958466453675, + "grad_norm": 0.08579347282648087, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7124 + }, + { + "epoch": 5.690894568690096, + "grad_norm": 0.11220933496952057, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7125 + }, + { + "epoch": 5.6916932907348246, + "grad_norm": 0.11851351708173752, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 7126 + }, + { + "epoch": 5.692492012779553, + "grad_norm": 0.0839112401008606, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7127 + }, + { + "epoch": 5.693290734824281, + "grad_norm": 0.07717803865671158, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7128 + }, + { + "epoch": 5.694089456869009, + "grad_norm": 0.10219333320856094, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7129 + }, + { + "epoch": 5.694888178913738, + "grad_norm": 0.06746016442775726, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7130 + }, + { + "epoch": 5.695686900958466, + "grad_norm": 0.09630785137414932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7131 + }, + { + "epoch": 5.696485623003195, + "grad_norm": 0.059845466166734695, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7132 + }, + { + "epoch": 5.697284345047923, + "grad_norm": 0.10587267577648163, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7133 + }, + { + "epoch": 5.698083067092652, + "grad_norm": 0.12221334874629974, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7134 + }, + { + "epoch": 5.69888178913738, + "grad_norm": 0.1638030856847763, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7135 + }, + { + "epoch": 5.699680511182109, + "grad_norm": 0.04686988145112991, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7136 + }, + { + "epoch": 5.700479233226837, + "grad_norm": 0.09120972454547882, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7137 + }, + { + "epoch": 5.701277955271565, + "grad_norm": 0.1081257089972496, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7138 + }, + { + "epoch": 5.702076677316294, + "grad_norm": 0.07313218712806702, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7139 + }, + { + "epoch": 5.702875399361022, + "grad_norm": 0.06039511039853096, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7140 + }, + { + "epoch": 5.703674121405751, + "grad_norm": 0.14473693072795868, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7141 + }, + { + "epoch": 5.704472843450479, + "grad_norm": 0.15062592923641205, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7142 + }, + { + "epoch": 5.705271565495208, + "grad_norm": 0.09711029380559921, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7143 + }, + { + "epoch": 5.706070287539936, + "grad_norm": 0.056874651461839676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7144 + }, + { + "epoch": 5.706869009584665, + "grad_norm": 0.1077205091714859, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7145 + }, + { + "epoch": 5.707667731629393, + "grad_norm": 0.1437366008758545, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7146 + }, + { + "epoch": 5.708466453674122, + "grad_norm": 0.06206873059272766, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7147 + }, + { + "epoch": 5.7092651757188495, + "grad_norm": 0.06379563361406326, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7148 + }, + { + "epoch": 5.710063897763578, + "grad_norm": 0.11586727946996689, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7149 + }, + { + "epoch": 5.710862619808307, + "grad_norm": 0.12792269885540009, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7150 + }, + { + "epoch": 5.711661341853035, + "grad_norm": 0.08514344692230225, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7151 + }, + { + "epoch": 5.712460063897764, + "grad_norm": 0.045359376817941666, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7152 + }, + { + "epoch": 5.713258785942492, + "grad_norm": 0.13782942295074463, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7153 + }, + { + "epoch": 5.714057507987221, + "grad_norm": 0.1362733691930771, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7154 + }, + { + "epoch": 5.714856230031949, + "grad_norm": 0.11249929666519165, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7155 + }, + { + "epoch": 5.715654952076678, + "grad_norm": 0.07308060675859451, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7156 + }, + { + "epoch": 5.716453674121405, + "grad_norm": 0.08434231579303741, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7157 + }, + { + "epoch": 5.717252396166134, + "grad_norm": 0.0800870731472969, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7158 + }, + { + "epoch": 5.718051118210862, + "grad_norm": 0.09833595156669617, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 7159 + }, + { + "epoch": 5.718849840255591, + "grad_norm": 0.06979871541261673, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7160 + }, + { + "epoch": 5.7196485623003195, + "grad_norm": 0.3326590657234192, + "learning_rate": 0.0005, + "loss": 1.0471, + "step": 7161 + }, + { + "epoch": 5.720447284345048, + "grad_norm": 0.07953538745641708, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7162 + }, + { + "epoch": 5.7212460063897765, + "grad_norm": 0.06084589287638664, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7163 + }, + { + "epoch": 5.722044728434505, + "grad_norm": 0.05060078203678131, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7164 + }, + { + "epoch": 5.722843450479234, + "grad_norm": 0.11765584349632263, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7165 + }, + { + "epoch": 5.723642172523961, + "grad_norm": 0.11147762089967728, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7166 + }, + { + "epoch": 5.72444089456869, + "grad_norm": 0.051353756338357925, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7167 + }, + { + "epoch": 5.725239616613418, + "grad_norm": 0.06255709379911423, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7168 + }, + { + "epoch": 5.726038338658147, + "grad_norm": 0.048915427178144455, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7169 + }, + { + "epoch": 5.726837060702875, + "grad_norm": 0.057233601808547974, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7170 + }, + { + "epoch": 5.727635782747604, + "grad_norm": 0.0828251764178276, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7171 + }, + { + "epoch": 5.728434504792332, + "grad_norm": 0.07387874275445938, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7172 + }, + { + "epoch": 5.729233226837061, + "grad_norm": 0.04857983812689781, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7173 + }, + { + "epoch": 5.7300319488817895, + "grad_norm": 0.07202452421188354, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7174 + }, + { + "epoch": 5.730830670926517, + "grad_norm": 0.4291386306285858, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7175 + }, + { + "epoch": 5.731629392971246, + "grad_norm": 0.07219598442316055, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7176 + }, + { + "epoch": 5.732428115015974, + "grad_norm": 0.07889580726623535, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7177 + }, + { + "epoch": 5.733226837060703, + "grad_norm": 0.1154242753982544, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7178 + }, + { + "epoch": 5.734025559105431, + "grad_norm": 0.1711360067129135, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7179 + }, + { + "epoch": 5.73482428115016, + "grad_norm": 0.15897679328918457, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7180 + }, + { + "epoch": 5.735623003194888, + "grad_norm": 0.056718453764915466, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7181 + }, + { + "epoch": 5.736421725239617, + "grad_norm": 0.10130516439676285, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7182 + }, + { + "epoch": 5.737220447284345, + "grad_norm": 0.10965991020202637, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7183 + }, + { + "epoch": 5.738019169329074, + "grad_norm": 0.043925706297159195, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7184 + }, + { + "epoch": 5.738817891373802, + "grad_norm": 0.16040641069412231, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7185 + }, + { + "epoch": 5.73961661341853, + "grad_norm": 0.545796275138855, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7186 + }, + { + "epoch": 5.7404153354632586, + "grad_norm": 0.12285015732049942, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7187 + }, + { + "epoch": 5.741214057507987, + "grad_norm": 0.1241980791091919, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 7188 + }, + { + "epoch": 5.742012779552716, + "grad_norm": 0.18415005505084991, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7189 + }, + { + "epoch": 5.742811501597444, + "grad_norm": 0.1455639749765396, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7190 + }, + { + "epoch": 5.743610223642173, + "grad_norm": 0.05731341987848282, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7191 + }, + { + "epoch": 5.744408945686901, + "grad_norm": 0.10810694098472595, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7192 + }, + { + "epoch": 5.74520766773163, + "grad_norm": 0.13279423117637634, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7193 + }, + { + "epoch": 5.746006389776358, + "grad_norm": 0.048075832426548004, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7194 + }, + { + "epoch": 5.746805111821086, + "grad_norm": 0.07276510447263718, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7195 + }, + { + "epoch": 5.747603833865814, + "grad_norm": 0.0666821077466011, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7196 + }, + { + "epoch": 5.748402555910543, + "grad_norm": 0.0950300320982933, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7197 + }, + { + "epoch": 5.7492012779552715, + "grad_norm": 0.07229208946228027, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7198 + }, + { + "epoch": 5.75, + "grad_norm": 0.08129260689020157, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7199 + }, + { + "epoch": 5.7507987220447285, + "grad_norm": 0.08685708791017532, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7200 + }, + { + "epoch": 5.751597444089457, + "grad_norm": 0.048116523772478104, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7201 + }, + { + "epoch": 5.752396166134186, + "grad_norm": 0.08470416814088821, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7202 + }, + { + "epoch": 5.753194888178914, + "grad_norm": 0.09388689696788788, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7203 + }, + { + "epoch": 5.753993610223642, + "grad_norm": 0.07961093634366989, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7204 + }, + { + "epoch": 5.75479233226837, + "grad_norm": 0.05949364975094795, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7205 + }, + { + "epoch": 5.755591054313099, + "grad_norm": 0.10149726271629333, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7206 + }, + { + "epoch": 5.756389776357827, + "grad_norm": 0.30414992570877075, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7207 + }, + { + "epoch": 5.757188498402556, + "grad_norm": 0.06670042872428894, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7208 + }, + { + "epoch": 5.757987220447284, + "grad_norm": 0.061501920223236084, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7209 + }, + { + "epoch": 5.758785942492013, + "grad_norm": 0.06627584993839264, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7210 + }, + { + "epoch": 5.7595846645367414, + "grad_norm": 0.1268157660961151, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7211 + }, + { + "epoch": 5.76038338658147, + "grad_norm": 0.10253716260194778, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7212 + }, + { + "epoch": 5.761182108626198, + "grad_norm": 0.08384321630001068, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7213 + }, + { + "epoch": 5.761980830670926, + "grad_norm": 0.09078267216682434, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7214 + }, + { + "epoch": 5.762779552715655, + "grad_norm": 0.10487394034862518, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7215 + }, + { + "epoch": 5.763578274760383, + "grad_norm": 0.12192805856466293, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7216 + }, + { + "epoch": 5.764376996805112, + "grad_norm": 0.16597039997577667, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7217 + }, + { + "epoch": 5.76517571884984, + "grad_norm": 0.08498643338680267, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7218 + }, + { + "epoch": 5.765974440894569, + "grad_norm": 0.12794862687587738, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7219 + }, + { + "epoch": 5.766773162939297, + "grad_norm": 0.13595858216285706, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7220 + }, + { + "epoch": 5.767571884984026, + "grad_norm": 0.08182058483362198, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7221 + }, + { + "epoch": 5.768370607028754, + "grad_norm": 0.11747279763221741, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7222 + }, + { + "epoch": 5.769169329073483, + "grad_norm": 0.13400238752365112, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7223 + }, + { + "epoch": 5.7699680511182105, + "grad_norm": 0.18527893722057343, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7224 + }, + { + "epoch": 5.770766773162939, + "grad_norm": 0.05130131170153618, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7225 + }, + { + "epoch": 5.771565495207668, + "grad_norm": 0.14139772951602936, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7226 + }, + { + "epoch": 5.772364217252396, + "grad_norm": 0.07901434600353241, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7227 + }, + { + "epoch": 5.773162939297125, + "grad_norm": 0.0642717182636261, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7228 + }, + { + "epoch": 5.773961661341853, + "grad_norm": 0.0693419873714447, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7229 + }, + { + "epoch": 5.774760383386582, + "grad_norm": 0.06490292400121689, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7230 + }, + { + "epoch": 5.77555910543131, + "grad_norm": 0.09405414760112762, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7231 + }, + { + "epoch": 5.776357827476039, + "grad_norm": 0.10439605265855789, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7232 + }, + { + "epoch": 5.777156549520766, + "grad_norm": 0.06811316311359406, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7233 + }, + { + "epoch": 5.777955271565495, + "grad_norm": 0.0707770362496376, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7234 + }, + { + "epoch": 5.7787539936102235, + "grad_norm": 0.08751409500837326, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7235 + }, + { + "epoch": 5.779552715654952, + "grad_norm": 0.09626015275716782, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7236 + }, + { + "epoch": 5.7803514376996805, + "grad_norm": 0.11487453430891037, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7237 + }, + { + "epoch": 5.781150159744409, + "grad_norm": 0.06278856843709946, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7238 + }, + { + "epoch": 5.781948881789138, + "grad_norm": 0.131802499294281, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7239 + }, + { + "epoch": 5.782747603833866, + "grad_norm": 0.09209976345300674, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7240 + }, + { + "epoch": 5.783546325878595, + "grad_norm": 0.06524617224931717, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7241 + }, + { + "epoch": 5.784345047923322, + "grad_norm": 0.10735169053077698, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7242 + }, + { + "epoch": 5.785143769968051, + "grad_norm": 0.08926022797822952, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7243 + }, + { + "epoch": 5.785942492012779, + "grad_norm": 0.08254969120025635, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7244 + }, + { + "epoch": 5.786741214057508, + "grad_norm": 0.07478158175945282, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7245 + }, + { + "epoch": 5.787539936102236, + "grad_norm": 0.0974164679646492, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7246 + }, + { + "epoch": 5.788338658146965, + "grad_norm": 0.05145352706313133, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7247 + }, + { + "epoch": 5.789137380191693, + "grad_norm": 0.11986715346574783, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 7248 + }, + { + "epoch": 5.789936102236422, + "grad_norm": 0.12020506709814072, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7249 + }, + { + "epoch": 5.7907348242811505, + "grad_norm": 0.07199704647064209, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7250 + }, + { + "epoch": 5.791533546325878, + "grad_norm": 0.10702182352542877, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7251 + }, + { + "epoch": 5.792332268370607, + "grad_norm": 0.10817115753889084, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7252 + }, + { + "epoch": 5.793130990415335, + "grad_norm": 0.1875494122505188, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7253 + }, + { + "epoch": 5.793929712460064, + "grad_norm": 0.07347052544355392, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7254 + }, + { + "epoch": 5.794728434504792, + "grad_norm": 0.08588847517967224, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7255 + }, + { + "epoch": 5.795527156549521, + "grad_norm": 0.08241020143032074, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7256 + }, + { + "epoch": 5.796325878594249, + "grad_norm": 0.06322775781154633, + "learning_rate": 0.0005, + "loss": 1.046, + "step": 7257 + }, + { + "epoch": 5.797124600638978, + "grad_norm": 0.10279159247875214, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7258 + }, + { + "epoch": 5.797923322683706, + "grad_norm": 0.1887427717447281, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7259 + }, + { + "epoch": 5.798722044728435, + "grad_norm": 0.12288179248571396, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7260 + }, + { + "epoch": 5.799520766773163, + "grad_norm": 0.07014663517475128, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7261 + }, + { + "epoch": 5.800319488817891, + "grad_norm": 0.3741980493068695, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7262 + }, + { + "epoch": 5.80111821086262, + "grad_norm": 0.10083315521478653, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7263 + }, + { + "epoch": 5.801916932907348, + "grad_norm": 0.06427261233329773, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7264 + }, + { + "epoch": 5.802715654952077, + "grad_norm": 0.06265366077423096, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7265 + }, + { + "epoch": 5.803514376996805, + "grad_norm": 0.09602728486061096, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7266 + }, + { + "epoch": 5.804313099041534, + "grad_norm": 0.10369620472192764, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7267 + }, + { + "epoch": 5.805111821086262, + "grad_norm": 0.09742012619972229, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7268 + }, + { + "epoch": 5.805910543130991, + "grad_norm": 0.11579136550426483, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7269 + }, + { + "epoch": 5.806709265175719, + "grad_norm": 0.11265771090984344, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7270 + }, + { + "epoch": 5.807507987220447, + "grad_norm": 0.10684274882078171, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7271 + }, + { + "epoch": 5.8083067092651754, + "grad_norm": 0.12550850212574005, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7272 + }, + { + "epoch": 5.809105431309904, + "grad_norm": 0.04966668784618378, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7273 + }, + { + "epoch": 5.8099041533546325, + "grad_norm": 0.26124852895736694, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7274 + }, + { + "epoch": 5.810702875399361, + "grad_norm": 0.12293774634599686, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7275 + }, + { + "epoch": 5.81150159744409, + "grad_norm": 0.11183387041091919, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7276 + }, + { + "epoch": 5.812300319488818, + "grad_norm": 0.08738099783658981, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7277 + }, + { + "epoch": 5.813099041533547, + "grad_norm": 0.06429604440927505, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 7278 + }, + { + "epoch": 5.813897763578275, + "grad_norm": 0.09102299064397812, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7279 + }, + { + "epoch": 5.814696485623003, + "grad_norm": 0.06249788776040077, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7280 + }, + { + "epoch": 5.815495207667731, + "grad_norm": 0.08752568066120148, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7281 + }, + { + "epoch": 5.81629392971246, + "grad_norm": 0.06289692968130112, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7282 + }, + { + "epoch": 5.817092651757188, + "grad_norm": 0.1269187480211258, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7283 + }, + { + "epoch": 5.817891373801917, + "grad_norm": 0.0839361846446991, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7284 + }, + { + "epoch": 5.818690095846645, + "grad_norm": 0.0855027437210083, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7285 + }, + { + "epoch": 5.819488817891374, + "grad_norm": 0.20559446513652802, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7286 + }, + { + "epoch": 5.8202875399361025, + "grad_norm": 0.0740990862250328, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7287 + }, + { + "epoch": 5.821086261980831, + "grad_norm": 0.06762924790382385, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7288 + }, + { + "epoch": 5.821884984025559, + "grad_norm": 0.5238296985626221, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7289 + }, + { + "epoch": 5.822683706070287, + "grad_norm": 0.09929470717906952, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7290 + }, + { + "epoch": 5.823482428115016, + "grad_norm": 0.11528550088405609, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7291 + }, + { + "epoch": 5.824281150159744, + "grad_norm": 0.10563576966524124, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 7292 + }, + { + "epoch": 5.825079872204473, + "grad_norm": 0.13924843072891235, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7293 + }, + { + "epoch": 5.825878594249201, + "grad_norm": 0.1332271546125412, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7294 + }, + { + "epoch": 5.82667731629393, + "grad_norm": 0.15709803998470306, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7295 + }, + { + "epoch": 5.827476038338658, + "grad_norm": 0.19638708233833313, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7296 + }, + { + "epoch": 5.828274760383387, + "grad_norm": 0.16845624148845673, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7297 + }, + { + "epoch": 5.8290734824281145, + "grad_norm": 0.15753695368766785, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7298 + }, + { + "epoch": 5.829872204472844, + "grad_norm": 0.04734346270561218, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 7299 + }, + { + "epoch": 5.830670926517572, + "grad_norm": 0.48153460025787354, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7300 + }, + { + "epoch": 5.8314696485623, + "grad_norm": 0.09118880331516266, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7301 + }, + { + "epoch": 5.832268370607029, + "grad_norm": 0.10301438719034195, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7302 + }, + { + "epoch": 5.833067092651757, + "grad_norm": 0.12838974595069885, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7303 + }, + { + "epoch": 5.833865814696486, + "grad_norm": 0.1537700593471527, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7304 + }, + { + "epoch": 5.834664536741214, + "grad_norm": 0.08763979375362396, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7305 + }, + { + "epoch": 5.835463258785943, + "grad_norm": 0.2613058388233185, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 7306 + }, + { + "epoch": 5.836261980830671, + "grad_norm": 0.13767825067043304, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7307 + }, + { + "epoch": 5.8370607028754, + "grad_norm": 0.14907905459403992, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7308 + }, + { + "epoch": 5.837859424920127, + "grad_norm": 0.3314233124256134, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 7309 + }, + { + "epoch": 5.838658146964856, + "grad_norm": 0.1368636041879654, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7310 + }, + { + "epoch": 5.8394568690095845, + "grad_norm": 0.13423767685890198, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7311 + }, + { + "epoch": 5.840255591054313, + "grad_norm": 0.08914478868246078, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7312 + }, + { + "epoch": 5.8410543130990416, + "grad_norm": 0.09363356977701187, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7313 + }, + { + "epoch": 5.84185303514377, + "grad_norm": 0.226780965924263, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7314 + }, + { + "epoch": 5.842651757188499, + "grad_norm": 0.09002092480659485, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7315 + }, + { + "epoch": 5.843450479233227, + "grad_norm": 0.06387127935886383, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7316 + }, + { + "epoch": 5.844249201277956, + "grad_norm": 0.1643945276737213, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7317 + }, + { + "epoch": 5.845047923322683, + "grad_norm": 0.13561291992664337, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7318 + }, + { + "epoch": 5.845846645367412, + "grad_norm": 0.14334949851036072, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7319 + }, + { + "epoch": 5.84664536741214, + "grad_norm": 0.13982698321342468, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7320 + }, + { + "epoch": 5.847444089456869, + "grad_norm": 0.10822772979736328, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7321 + }, + { + "epoch": 5.848242811501597, + "grad_norm": 0.07073087245225906, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7322 + }, + { + "epoch": 5.849041533546326, + "grad_norm": 0.09560684859752655, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7323 + }, + { + "epoch": 5.8498402555910545, + "grad_norm": 0.0882779061794281, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7324 + }, + { + "epoch": 5.850638977635783, + "grad_norm": 0.17319771647453308, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7325 + }, + { + "epoch": 5.8514376996805115, + "grad_norm": 0.12140306830406189, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7326 + }, + { + "epoch": 5.852236421725239, + "grad_norm": 0.12064560502767563, + "learning_rate": 0.0005, + "loss": 1.0428, + "step": 7327 + }, + { + "epoch": 5.853035143769968, + "grad_norm": 0.0733642578125, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7328 + }, + { + "epoch": 5.853833865814696, + "grad_norm": 0.08563291281461716, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7329 + }, + { + "epoch": 5.854632587859425, + "grad_norm": 0.11337493360042572, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7330 + }, + { + "epoch": 5.855431309904153, + "grad_norm": 0.12164553254842758, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 7331 + }, + { + "epoch": 5.856230031948882, + "grad_norm": 0.06406484544277191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7332 + }, + { + "epoch": 5.85702875399361, + "grad_norm": 0.0765780508518219, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7333 + }, + { + "epoch": 5.857827476038339, + "grad_norm": 0.12847815454006195, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7334 + }, + { + "epoch": 5.858626198083067, + "grad_norm": 0.11934550106525421, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7335 + }, + { + "epoch": 5.859424920127795, + "grad_norm": 0.08170188963413239, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7336 + }, + { + "epoch": 5.8602236421725244, + "grad_norm": 0.13636507093906403, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7337 + }, + { + "epoch": 5.861022364217252, + "grad_norm": 0.11030741780996323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7338 + }, + { + "epoch": 5.861821086261981, + "grad_norm": 0.10200777649879456, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7339 + }, + { + "epoch": 5.862619808306709, + "grad_norm": 0.09916897118091583, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7340 + }, + { + "epoch": 5.863418530351438, + "grad_norm": 0.08136509358882904, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 7341 + }, + { + "epoch": 5.864217252396166, + "grad_norm": 0.051609545946121216, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7342 + }, + { + "epoch": 5.865015974440895, + "grad_norm": 0.061890844255685806, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7343 + }, + { + "epoch": 5.865814696485623, + "grad_norm": 0.10308966040611267, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7344 + }, + { + "epoch": 5.866613418530352, + "grad_norm": 0.06762709468603134, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7345 + }, + { + "epoch": 5.86741214057508, + "grad_norm": 0.07767036557197571, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7346 + }, + { + "epoch": 5.868210862619808, + "grad_norm": 0.10608458518981934, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7347 + }, + { + "epoch": 5.8690095846645365, + "grad_norm": 0.13812315464019775, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7348 + }, + { + "epoch": 5.869808306709265, + "grad_norm": 0.10485442727804184, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7349 + }, + { + "epoch": 5.8706070287539935, + "grad_norm": 0.08510198444128036, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7350 + }, + { + "epoch": 5.871405750798722, + "grad_norm": 0.17235122621059418, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7351 + }, + { + "epoch": 5.872204472843451, + "grad_norm": 0.057075515389442444, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7352 + }, + { + "epoch": 5.873003194888179, + "grad_norm": 0.3470633924007416, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7353 + }, + { + "epoch": 5.873801916932908, + "grad_norm": 0.1859748661518097, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 7354 + }, + { + "epoch": 5.874600638977636, + "grad_norm": 0.2350156307220459, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7355 + }, + { + "epoch": 5.875399361022364, + "grad_norm": 0.11264859884977341, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 7356 + }, + { + "epoch": 5.876198083067092, + "grad_norm": 0.2859210968017578, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7357 + }, + { + "epoch": 5.876996805111821, + "grad_norm": 0.08706829696893692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7358 + }, + { + "epoch": 5.877795527156549, + "grad_norm": 0.0644318088889122, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7359 + }, + { + "epoch": 5.878594249201278, + "grad_norm": 0.10985474288463593, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7360 + }, + { + "epoch": 5.8793929712460065, + "grad_norm": 0.09968867897987366, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7361 + }, + { + "epoch": 5.880191693290735, + "grad_norm": 0.07277355343103409, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 7362 + }, + { + "epoch": 5.8809904153354635, + "grad_norm": 0.043085962533950806, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7363 + }, + { + "epoch": 5.881789137380192, + "grad_norm": 0.10392415523529053, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7364 + }, + { + "epoch": 5.88258785942492, + "grad_norm": 0.05523041635751724, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7365 + }, + { + "epoch": 5.883386581469648, + "grad_norm": 0.1754276603460312, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7366 + }, + { + "epoch": 5.884185303514377, + "grad_norm": 0.09561391174793243, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7367 + }, + { + "epoch": 5.884984025559105, + "grad_norm": 0.17572976648807526, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7368 + }, + { + "epoch": 5.885782747603834, + "grad_norm": 0.06476190686225891, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7369 + }, + { + "epoch": 5.886581469648562, + "grad_norm": 0.08763223886489868, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7370 + }, + { + "epoch": 5.887380191693291, + "grad_norm": 0.04419226944446564, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7371 + }, + { + "epoch": 5.888178913738019, + "grad_norm": 0.08707522600889206, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 7372 + }, + { + "epoch": 5.888977635782748, + "grad_norm": 0.3117498457431793, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7373 + }, + { + "epoch": 5.8897763578274756, + "grad_norm": 0.04153338074684143, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7374 + }, + { + "epoch": 5.890575079872205, + "grad_norm": 0.10575849562883377, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7375 + }, + { + "epoch": 5.891373801916933, + "grad_norm": 0.07147886604070663, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7376 + }, + { + "epoch": 5.892172523961661, + "grad_norm": 0.05394810438156128, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7377 + }, + { + "epoch": 5.89297124600639, + "grad_norm": 0.15453197062015533, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7378 + }, + { + "epoch": 5.893769968051118, + "grad_norm": 0.19460639357566833, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7379 + }, + { + "epoch": 5.894568690095847, + "grad_norm": 0.13046157360076904, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7380 + }, + { + "epoch": 5.895367412140575, + "grad_norm": 0.09074800461530685, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7381 + }, + { + "epoch": 5.896166134185304, + "grad_norm": 0.09315948188304901, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7382 + }, + { + "epoch": 5.896964856230032, + "grad_norm": 0.0572352297604084, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7383 + }, + { + "epoch": 5.897763578274761, + "grad_norm": 0.09366700798273087, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7384 + }, + { + "epoch": 5.8985623003194885, + "grad_norm": 0.12643125653266907, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7385 + }, + { + "epoch": 5.899361022364217, + "grad_norm": 0.14831441640853882, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7386 + }, + { + "epoch": 5.9001597444089455, + "grad_norm": 0.06892798840999603, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7387 + }, + { + "epoch": 5.900958466453674, + "grad_norm": 0.24058189988136292, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7388 + }, + { + "epoch": 5.901757188498403, + "grad_norm": 0.12589944899082184, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7389 + }, + { + "epoch": 5.902555910543131, + "grad_norm": 0.10197508335113525, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7390 + }, + { + "epoch": 5.90335463258786, + "grad_norm": 0.04367182031273842, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7391 + }, + { + "epoch": 5.904153354632588, + "grad_norm": 0.11131702363491058, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7392 + }, + { + "epoch": 5.904952076677317, + "grad_norm": 0.10258752107620239, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7393 + }, + { + "epoch": 5.905750798722044, + "grad_norm": 0.05077935755252838, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7394 + }, + { + "epoch": 5.906549520766773, + "grad_norm": 0.13514964282512665, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7395 + }, + { + "epoch": 5.907348242811501, + "grad_norm": 0.365681916475296, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7396 + }, + { + "epoch": 5.90814696485623, + "grad_norm": 0.09199032932519913, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7397 + }, + { + "epoch": 5.9089456869009584, + "grad_norm": 0.10341943800449371, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7398 + }, + { + "epoch": 5.909744408945687, + "grad_norm": 0.05396822467446327, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7399 + }, + { + "epoch": 5.9105431309904155, + "grad_norm": 0.06582850217819214, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 7400 + }, + { + "epoch": 5.911341853035144, + "grad_norm": 0.04932714253664017, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7401 + }, + { + "epoch": 5.912140575079873, + "grad_norm": 0.08820181339979172, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7402 + }, + { + "epoch": 5.9129392971246, + "grad_norm": 0.08759067952632904, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7403 + }, + { + "epoch": 5.913738019169329, + "grad_norm": 0.0582246370613575, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7404 + }, + { + "epoch": 5.914536741214057, + "grad_norm": 0.3632248044013977, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7405 + }, + { + "epoch": 5.915335463258786, + "grad_norm": 0.054485730826854706, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 7406 + }, + { + "epoch": 5.916134185303514, + "grad_norm": 0.06776587665081024, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7407 + }, + { + "epoch": 5.916932907348243, + "grad_norm": 0.06876091659069061, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7408 + }, + { + "epoch": 5.917731629392971, + "grad_norm": 0.06507224589586258, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7409 + }, + { + "epoch": 5.9185303514377, + "grad_norm": 1.061123013496399, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7410 + }, + { + "epoch": 5.919329073482428, + "grad_norm": 0.2808170020580292, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7411 + }, + { + "epoch": 5.920127795527156, + "grad_norm": 0.2075907289981842, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7412 + }, + { + "epoch": 5.9209265175718855, + "grad_norm": 0.08707362413406372, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 7413 + }, + { + "epoch": 5.921725239616613, + "grad_norm": 0.17357248067855835, + "learning_rate": 0.0005, + "loss": 1.0666, + "step": 7414 + }, + { + "epoch": 5.922523961661342, + "grad_norm": 0.19713328778743744, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7415 + }, + { + "epoch": 5.92332268370607, + "grad_norm": 0.10456258803606033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7416 + }, + { + "epoch": 5.924121405750799, + "grad_norm": 0.10678638517856598, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 7417 + }, + { + "epoch": 5.924920127795527, + "grad_norm": 0.12577000260353088, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7418 + }, + { + "epoch": 5.925718849840256, + "grad_norm": 0.14730660617351532, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7419 + }, + { + "epoch": 5.926517571884984, + "grad_norm": 0.07055118680000305, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7420 + }, + { + "epoch": 5.927316293929713, + "grad_norm": 0.10249259322881699, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7421 + }, + { + "epoch": 5.928115015974441, + "grad_norm": 0.06859050691127777, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7422 + }, + { + "epoch": 5.928913738019169, + "grad_norm": 0.043517664074897766, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7423 + }, + { + "epoch": 5.9297124600638975, + "grad_norm": 0.06680947542190552, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7424 + }, + { + "epoch": 5.930511182108626, + "grad_norm": 0.07522429525852203, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7425 + }, + { + "epoch": 5.931309904153355, + "grad_norm": 0.15828543901443481, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 7426 + }, + { + "epoch": 5.932108626198083, + "grad_norm": 0.19134600460529327, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7427 + }, + { + "epoch": 5.932907348242812, + "grad_norm": 0.12455222010612488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7428 + }, + { + "epoch": 5.93370607028754, + "grad_norm": 0.11147905886173248, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7429 + }, + { + "epoch": 5.934504792332269, + "grad_norm": 0.1238674744963646, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7430 + }, + { + "epoch": 5.935303514376997, + "grad_norm": 0.15700307488441467, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 7431 + }, + { + "epoch": 5.936102236421725, + "grad_norm": 0.11487080156803131, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7432 + }, + { + "epoch": 5.936900958466453, + "grad_norm": 0.11961077898740768, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7433 + }, + { + "epoch": 5.937699680511182, + "grad_norm": 0.07594173401594162, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7434 + }, + { + "epoch": 5.93849840255591, + "grad_norm": 0.19439400732517242, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7435 + }, + { + "epoch": 5.939297124600639, + "grad_norm": 0.17745599150657654, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7436 + }, + { + "epoch": 5.9400958466453675, + "grad_norm": 0.15732692182064056, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7437 + }, + { + "epoch": 5.940894568690096, + "grad_norm": 0.08824916929006577, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7438 + }, + { + "epoch": 5.9416932907348246, + "grad_norm": 0.12354888767004013, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7439 + }, + { + "epoch": 5.942492012779553, + "grad_norm": 0.10940376669168472, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7440 + }, + { + "epoch": 5.943290734824281, + "grad_norm": 0.05808279290795326, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7441 + }, + { + "epoch": 5.944089456869009, + "grad_norm": 0.19519653916358948, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 7442 + }, + { + "epoch": 5.944888178913738, + "grad_norm": 0.07913058996200562, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 7443 + }, + { + "epoch": 5.945686900958466, + "grad_norm": 0.5150377750396729, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7444 + }, + { + "epoch": 5.946485623003195, + "grad_norm": 0.24083790183067322, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7445 + }, + { + "epoch": 5.947284345047923, + "grad_norm": 0.11291394382715225, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7446 + }, + { + "epoch": 5.948083067092652, + "grad_norm": 0.0899023786187172, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7447 + }, + { + "epoch": 5.94888178913738, + "grad_norm": 0.05489958077669144, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 7448 + }, + { + "epoch": 5.949680511182109, + "grad_norm": 0.12375161051750183, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7449 + }, + { + "epoch": 5.950479233226837, + "grad_norm": 0.11610512435436249, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7450 + }, + { + "epoch": 5.951277955271565, + "grad_norm": 0.06953240931034088, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7451 + }, + { + "epoch": 5.952076677316294, + "grad_norm": 0.09784717857837677, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7452 + }, + { + "epoch": 5.952875399361022, + "grad_norm": 0.059533409774303436, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7453 + }, + { + "epoch": 5.953674121405751, + "grad_norm": 0.06361017376184464, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7454 + }, + { + "epoch": 5.954472843450479, + "grad_norm": 0.33739587664604187, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7455 + }, + { + "epoch": 5.955271565495208, + "grad_norm": 0.0726039931178093, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7456 + }, + { + "epoch": 5.956070287539936, + "grad_norm": 0.047813788056373596, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7457 + }, + { + "epoch": 5.956869009584665, + "grad_norm": 0.05501490831375122, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7458 + }, + { + "epoch": 5.957667731629393, + "grad_norm": 0.24806374311447144, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7459 + }, + { + "epoch": 5.958466453674122, + "grad_norm": 0.09020408987998962, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7460 + }, + { + "epoch": 5.9592651757188495, + "grad_norm": 0.09845588356256485, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7461 + }, + { + "epoch": 5.960063897763578, + "grad_norm": 0.2733388841152191, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7462 + }, + { + "epoch": 5.960862619808307, + "grad_norm": 0.04368302598595619, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7463 + }, + { + "epoch": 5.961661341853035, + "grad_norm": 0.06559797376394272, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7464 + }, + { + "epoch": 5.962460063897764, + "grad_norm": 0.08194267004728317, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7465 + }, + { + "epoch": 5.963258785942492, + "grad_norm": 0.08440488576889038, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7466 + }, + { + "epoch": 5.964057507987221, + "grad_norm": 0.07046753168106079, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7467 + }, + { + "epoch": 5.964856230031949, + "grad_norm": 0.061910174787044525, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7468 + }, + { + "epoch": 5.965654952076678, + "grad_norm": 0.06781110167503357, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 7469 + }, + { + "epoch": 5.966453674121405, + "grad_norm": 0.0626576617360115, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 7470 + }, + { + "epoch": 5.967252396166134, + "grad_norm": 0.05339542031288147, + "learning_rate": 0.0005, + "loss": 1.0402, + "step": 7471 + }, + { + "epoch": 5.968051118210862, + "grad_norm": 0.09167633950710297, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7472 + }, + { + "epoch": 5.968849840255591, + "grad_norm": 0.07272132486104965, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7473 + }, + { + "epoch": 5.9696485623003195, + "grad_norm": 0.1218709796667099, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 7474 + }, + { + "epoch": 5.970447284345048, + "grad_norm": 0.21024082601070404, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7475 + }, + { + "epoch": 5.9712460063897765, + "grad_norm": 0.08869504183530807, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7476 + }, + { + "epoch": 5.972044728434505, + "grad_norm": 0.05930836871266365, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7477 + }, + { + "epoch": 5.972843450479234, + "grad_norm": 0.10009569674730301, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7478 + }, + { + "epoch": 5.973642172523961, + "grad_norm": 0.2543089687824249, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7479 + }, + { + "epoch": 5.97444089456869, + "grad_norm": 0.04702993854880333, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7480 + }, + { + "epoch": 5.975239616613418, + "grad_norm": 0.12841154634952545, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7481 + }, + { + "epoch": 5.976038338658147, + "grad_norm": 0.10137920081615448, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7482 + }, + { + "epoch": 5.976837060702875, + "grad_norm": 0.0582512766122818, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7483 + }, + { + "epoch": 5.977635782747604, + "grad_norm": 0.06556501984596252, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7484 + }, + { + "epoch": 5.978434504792332, + "grad_norm": 0.2065235674381256, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7485 + }, + { + "epoch": 5.979233226837061, + "grad_norm": 0.07943716645240784, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7486 + }, + { + "epoch": 5.9800319488817895, + "grad_norm": 0.05257594957947731, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 7487 + }, + { + "epoch": 5.980830670926517, + "grad_norm": 0.06949680298566818, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7488 + }, + { + "epoch": 5.981629392971246, + "grad_norm": 0.0967894196510315, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7489 + }, + { + "epoch": 5.982428115015974, + "grad_norm": 1.068231463432312, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7490 + }, + { + "epoch": 5.983226837060703, + "grad_norm": 0.0648348405957222, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 7491 + }, + { + "epoch": 5.984025559105431, + "grad_norm": 0.2540450096130371, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7492 + }, + { + "epoch": 5.98482428115016, + "grad_norm": 0.1624346375465393, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7493 + }, + { + "epoch": 5.985623003194888, + "grad_norm": 0.10054703056812286, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7494 + }, + { + "epoch": 5.986421725239617, + "grad_norm": 0.05147058889269829, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7495 + }, + { + "epoch": 5.987220447284345, + "grad_norm": 0.10036633163690567, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7496 + }, + { + "epoch": 5.988019169329074, + "grad_norm": 0.14611777663230896, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7497 + }, + { + "epoch": 5.988817891373802, + "grad_norm": 0.12323570251464844, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7498 + }, + { + "epoch": 5.98961661341853, + "grad_norm": 0.04539888724684715, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7499 + }, + { + "epoch": 5.9904153354632586, + "grad_norm": 0.14555387198925018, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7500 + }, + { + "epoch": 5.991214057507987, + "grad_norm": 0.3205990195274353, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7501 + }, + { + "epoch": 5.992012779552716, + "grad_norm": 0.22900770604610443, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7502 + }, + { + "epoch": 5.992811501597444, + "grad_norm": 0.11138728260993958, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7503 + }, + { + "epoch": 5.993610223642173, + "grad_norm": 0.09425637125968933, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7504 + }, + { + "epoch": 5.994408945686901, + "grad_norm": 0.18409870564937592, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7505 + }, + { + "epoch": 5.99520766773163, + "grad_norm": 0.1610010713338852, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7506 + }, + { + "epoch": 5.996006389776358, + "grad_norm": 0.2304852306842804, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7507 + }, + { + "epoch": 5.996805111821086, + "grad_norm": 0.09830645471811295, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7508 + }, + { + "epoch": 5.997603833865814, + "grad_norm": 0.12319398671388626, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7509 + }, + { + "epoch": 5.998402555910543, + "grad_norm": 0.07925699651241302, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7510 + }, + { + "epoch": 5.9992012779552715, + "grad_norm": 0.07079242914915085, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7511 + }, + { + "epoch": 6.0, + "grad_norm": 0.14047275483608246, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7512 + }, + { + "epoch": 6.0007987220447285, + "grad_norm": 0.172583669424057, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7513 + }, + { + "epoch": 6.001597444089457, + "grad_norm": 0.3635086119174957, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7514 + }, + { + "epoch": 6.002396166134186, + "grad_norm": 0.14463695883750916, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7515 + }, + { + "epoch": 6.003194888178914, + "grad_norm": 0.24417585134506226, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7516 + }, + { + "epoch": 6.003993610223642, + "grad_norm": 0.25690382719039917, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 7517 + }, + { + "epoch": 6.00479233226837, + "grad_norm": 0.12535394728183746, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 7518 + }, + { + "epoch": 6.005591054313099, + "grad_norm": 0.19279715418815613, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7519 + }, + { + "epoch": 6.006389776357827, + "grad_norm": 0.10537917166948318, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7520 + }, + { + "epoch": 6.007188498402556, + "grad_norm": 0.07752633094787598, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7521 + }, + { + "epoch": 6.007987220447284, + "grad_norm": 0.10693971067667007, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7522 + }, + { + "epoch": 6.008785942492013, + "grad_norm": 0.06399057805538177, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7523 + }, + { + "epoch": 6.0095846645367414, + "grad_norm": 0.12577609717845917, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7524 + }, + { + "epoch": 6.01038338658147, + "grad_norm": 0.12770701944828033, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7525 + }, + { + "epoch": 6.0111821086261985, + "grad_norm": 0.07679085433483124, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7526 + }, + { + "epoch": 6.011980830670926, + "grad_norm": 0.14353524148464203, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 7527 + }, + { + "epoch": 6.012779552715655, + "grad_norm": 0.3428184688091278, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7528 + }, + { + "epoch": 6.013578274760383, + "grad_norm": 0.1436242014169693, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 7529 + }, + { + "epoch": 6.014376996805112, + "grad_norm": 0.07608507573604584, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7530 + }, + { + "epoch": 6.01517571884984, + "grad_norm": 0.10932086408138275, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7531 + }, + { + "epoch": 6.015974440894569, + "grad_norm": 0.07631878554821014, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7532 + }, + { + "epoch": 6.016773162939297, + "grad_norm": 0.0718175396323204, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7533 + }, + { + "epoch": 6.017571884984026, + "grad_norm": 0.07661164551973343, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7534 + }, + { + "epoch": 6.018370607028754, + "grad_norm": 0.10753245651721954, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7535 + }, + { + "epoch": 6.019169329073482, + "grad_norm": 0.12740729749202728, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7536 + }, + { + "epoch": 6.0199680511182105, + "grad_norm": 0.14345388114452362, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 7537 + }, + { + "epoch": 6.020766773162939, + "grad_norm": 0.13860031962394714, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7538 + }, + { + "epoch": 6.021565495207668, + "grad_norm": 0.07766555994749069, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7539 + }, + { + "epoch": 6.022364217252396, + "grad_norm": 0.11253347247838974, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7540 + }, + { + "epoch": 6.023162939297125, + "grad_norm": 0.18870452046394348, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7541 + }, + { + "epoch": 6.023961661341853, + "grad_norm": 0.12401654571294785, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7542 + }, + { + "epoch": 6.024760383386582, + "grad_norm": 0.08025321364402771, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7543 + }, + { + "epoch": 6.02555910543131, + "grad_norm": 0.12504157423973083, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7544 + }, + { + "epoch": 6.026357827476039, + "grad_norm": 0.07099851220846176, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7545 + }, + { + "epoch": 6.027156549520766, + "grad_norm": 0.09573683142662048, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 7546 + }, + { + "epoch": 6.027955271565495, + "grad_norm": 0.18280553817749023, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 7547 + }, + { + "epoch": 6.0287539936102235, + "grad_norm": 0.15688058733940125, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7548 + }, + { + "epoch": 6.029552715654952, + "grad_norm": 0.11738436669111252, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 7549 + }, + { + "epoch": 6.0303514376996805, + "grad_norm": 1.275103211402893, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7550 + }, + { + "epoch": 6.031150159744409, + "grad_norm": 0.39542102813720703, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 7551 + }, + { + "epoch": 6.031948881789138, + "grad_norm": 0.32140371203422546, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7552 + }, + { + "epoch": 6.032747603833866, + "grad_norm": 0.2855371832847595, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7553 + }, + { + "epoch": 6.033546325878595, + "grad_norm": 0.14987513422966003, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7554 + }, + { + "epoch": 6.034345047923322, + "grad_norm": 0.25978198647499084, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7555 + }, + { + "epoch": 6.035143769968051, + "grad_norm": 0.14043942093849182, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 7556 + }, + { + "epoch": 6.035942492012779, + "grad_norm": 0.16670344769954681, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7557 + }, + { + "epoch": 6.036741214057508, + "grad_norm": 0.1668681800365448, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7558 + }, + { + "epoch": 6.037539936102236, + "grad_norm": 0.11135906726121902, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7559 + }, + { + "epoch": 6.038338658146965, + "grad_norm": 0.26222026348114014, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7560 + }, + { + "epoch": 6.039137380191693, + "grad_norm": 0.1670113205909729, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7561 + }, + { + "epoch": 6.039936102236422, + "grad_norm": 0.15860766172409058, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7562 + }, + { + "epoch": 6.0407348242811505, + "grad_norm": 0.2577793300151825, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7563 + }, + { + "epoch": 6.041533546325879, + "grad_norm": 0.11147591471672058, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7564 + }, + { + "epoch": 6.042332268370607, + "grad_norm": 0.18452385067939758, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7565 + }, + { + "epoch": 6.043130990415335, + "grad_norm": 0.19697625935077667, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7566 + }, + { + "epoch": 6.043929712460064, + "grad_norm": 0.08586452901363373, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7567 + }, + { + "epoch": 6.044728434504792, + "grad_norm": 0.18721693754196167, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7568 + }, + { + "epoch": 6.045527156549521, + "grad_norm": 0.13190758228302002, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7569 + }, + { + "epoch": 6.046325878594249, + "grad_norm": 0.09424075484275818, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7570 + }, + { + "epoch": 6.047124600638978, + "grad_norm": 0.15252210199832916, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7571 + }, + { + "epoch": 6.047923322683706, + "grad_norm": 0.06378420442342758, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 7572 + }, + { + "epoch": 6.048722044728435, + "grad_norm": 0.07665325701236725, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7573 + }, + { + "epoch": 6.0495207667731625, + "grad_norm": 0.0847245529294014, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7574 + }, + { + "epoch": 6.050319488817891, + "grad_norm": 0.034070566296577454, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7575 + }, + { + "epoch": 6.05111821086262, + "grad_norm": 0.08149915188550949, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7576 + }, + { + "epoch": 6.051916932907348, + "grad_norm": 0.07882412523031235, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7577 + }, + { + "epoch": 6.052715654952077, + "grad_norm": 0.055492956191301346, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 7578 + }, + { + "epoch": 6.053514376996805, + "grad_norm": 0.10246025770902634, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7579 + }, + { + "epoch": 6.054313099041534, + "grad_norm": 0.11067861318588257, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7580 + }, + { + "epoch": 6.055111821086262, + "grad_norm": 0.06063758581876755, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7581 + }, + { + "epoch": 6.055910543130991, + "grad_norm": 0.06848330795764923, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 7582 + }, + { + "epoch": 6.056709265175719, + "grad_norm": 0.10336993634700775, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 7583 + }, + { + "epoch": 6.057507987220447, + "grad_norm": 0.06081530824303627, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7584 + }, + { + "epoch": 6.0583067092651754, + "grad_norm": 0.08049804717302322, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7585 + }, + { + "epoch": 6.059105431309904, + "grad_norm": 0.09174875915050507, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7586 + }, + { + "epoch": 6.0599041533546325, + "grad_norm": 0.06121581420302391, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7587 + }, + { + "epoch": 6.060702875399361, + "grad_norm": 0.10653077065944672, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7588 + }, + { + "epoch": 6.06150159744409, + "grad_norm": 0.0676097571849823, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7589 + }, + { + "epoch": 6.062300319488818, + "grad_norm": 0.0625678300857544, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7590 + }, + { + "epoch": 6.063099041533547, + "grad_norm": 0.07936695963144302, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7591 + }, + { + "epoch": 6.063897763578275, + "grad_norm": 0.06149541214108467, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7592 + }, + { + "epoch": 6.064696485623003, + "grad_norm": 0.04549092426896095, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7593 + }, + { + "epoch": 6.065495207667731, + "grad_norm": 0.06483953446149826, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7594 + }, + { + "epoch": 6.06629392971246, + "grad_norm": 0.04048188030719757, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7595 + }, + { + "epoch": 6.067092651757188, + "grad_norm": 0.038281429558992386, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7596 + }, + { + "epoch": 6.067891373801917, + "grad_norm": 0.06686673313379288, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7597 + }, + { + "epoch": 6.068690095846645, + "grad_norm": 0.09025852382183075, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7598 + }, + { + "epoch": 6.069488817891374, + "grad_norm": 0.07517793774604797, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7599 + }, + { + "epoch": 6.0702875399361025, + "grad_norm": 0.06342573463916779, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7600 + }, + { + "epoch": 6.071086261980831, + "grad_norm": 0.08630760759115219, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7601 + }, + { + "epoch": 6.0718849840255595, + "grad_norm": 0.06443625688552856, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7602 + }, + { + "epoch": 6.072683706070287, + "grad_norm": 0.08748311549425125, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7603 + }, + { + "epoch": 6.073482428115016, + "grad_norm": 0.051623452454805374, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 7604 + }, + { + "epoch": 6.074281150159744, + "grad_norm": 0.09098891913890839, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7605 + }, + { + "epoch": 6.075079872204473, + "grad_norm": 0.14741428196430206, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7606 + }, + { + "epoch": 6.075878594249201, + "grad_norm": 0.064545176923275, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7607 + }, + { + "epoch": 6.07667731629393, + "grad_norm": 0.09775100648403168, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7608 + }, + { + "epoch": 6.077476038338658, + "grad_norm": 0.14192643761634827, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7609 + }, + { + "epoch": 6.078274760383387, + "grad_norm": 0.05390379950404167, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7610 + }, + { + "epoch": 6.079073482428115, + "grad_norm": 0.35628536343574524, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7611 + }, + { + "epoch": 6.079872204472843, + "grad_norm": 0.11727920919656754, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 7612 + }, + { + "epoch": 6.080670926517572, + "grad_norm": 0.053165338933467865, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7613 + }, + { + "epoch": 6.0814696485623, + "grad_norm": 0.12718519568443298, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7614 + }, + { + "epoch": 6.082268370607029, + "grad_norm": 0.12406741827726364, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7615 + }, + { + "epoch": 6.083067092651757, + "grad_norm": 0.05323740839958191, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7616 + }, + { + "epoch": 6.083865814696486, + "grad_norm": 0.09811960160732269, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 7617 + }, + { + "epoch": 6.084664536741214, + "grad_norm": 0.12453506886959076, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 7618 + }, + { + "epoch": 6.085463258785943, + "grad_norm": 0.13459496200084686, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7619 + }, + { + "epoch": 6.086261980830671, + "grad_norm": 0.20130378007888794, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7620 + }, + { + "epoch": 6.0870607028754, + "grad_norm": 0.11361974477767944, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7621 + }, + { + "epoch": 6.087859424920127, + "grad_norm": 0.07432135194540024, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7622 + }, + { + "epoch": 6.088658146964856, + "grad_norm": 0.14522314071655273, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7623 + }, + { + "epoch": 6.0894568690095845, + "grad_norm": 0.050937261432409286, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7624 + }, + { + "epoch": 6.090255591054313, + "grad_norm": 0.12386021763086319, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7625 + }, + { + "epoch": 6.0910543130990416, + "grad_norm": 0.1498231738805771, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7626 + }, + { + "epoch": 6.09185303514377, + "grad_norm": 0.042041294276714325, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7627 + }, + { + "epoch": 6.092651757188499, + "grad_norm": 0.1103961393237114, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7628 + }, + { + "epoch": 6.093450479233227, + "grad_norm": 0.12362606078386307, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7629 + }, + { + "epoch": 6.094249201277956, + "grad_norm": 0.07069346308708191, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7630 + }, + { + "epoch": 6.095047923322683, + "grad_norm": 0.1306593418121338, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7631 + }, + { + "epoch": 6.095846645367412, + "grad_norm": 0.11293961852788925, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7632 + }, + { + "epoch": 6.09664536741214, + "grad_norm": 0.07145176827907562, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7633 + }, + { + "epoch": 6.097444089456869, + "grad_norm": 0.11122562736272812, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7634 + }, + { + "epoch": 6.098242811501597, + "grad_norm": 0.039713576436042786, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7635 + }, + { + "epoch": 6.099041533546326, + "grad_norm": 0.11573004722595215, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7636 + }, + { + "epoch": 6.0998402555910545, + "grad_norm": 0.11995833367109299, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7637 + }, + { + "epoch": 6.100638977635783, + "grad_norm": 0.03895663470029831, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 7638 + }, + { + "epoch": 6.1014376996805115, + "grad_norm": 0.11274216324090958, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7639 + }, + { + "epoch": 6.102236421725239, + "grad_norm": 0.14242613315582275, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7640 + }, + { + "epoch": 6.103035143769968, + "grad_norm": 0.04954848438501358, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7641 + }, + { + "epoch": 6.103833865814696, + "grad_norm": 0.10814809799194336, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7642 + }, + { + "epoch": 6.104632587859425, + "grad_norm": 0.11696363240480423, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7643 + }, + { + "epoch": 6.105431309904153, + "grad_norm": 0.04597959294915199, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7644 + }, + { + "epoch": 6.106230031948882, + "grad_norm": 0.16304457187652588, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 7645 + }, + { + "epoch": 6.10702875399361, + "grad_norm": 0.14835208654403687, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7646 + }, + { + "epoch": 6.107827476038339, + "grad_norm": 0.06062949076294899, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7647 + }, + { + "epoch": 6.108626198083067, + "grad_norm": 0.1033453568816185, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7648 + }, + { + "epoch": 6.109424920127796, + "grad_norm": 0.14823280274868011, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7649 + }, + { + "epoch": 6.110223642172524, + "grad_norm": 0.18282924592494965, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7650 + }, + { + "epoch": 6.111022364217252, + "grad_norm": 0.17962203919887543, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7651 + }, + { + "epoch": 6.111821086261981, + "grad_norm": 0.12176015228033066, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7652 + }, + { + "epoch": 6.112619808306709, + "grad_norm": 0.07326921075582504, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7653 + }, + { + "epoch": 6.113418530351438, + "grad_norm": 0.24457645416259766, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7654 + }, + { + "epoch": 6.114217252396166, + "grad_norm": 0.1442916989326477, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 7655 + }, + { + "epoch": 6.115015974440895, + "grad_norm": 0.0716436356306076, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7656 + }, + { + "epoch": 6.115814696485623, + "grad_norm": 0.20782648026943207, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7657 + }, + { + "epoch": 6.116613418530352, + "grad_norm": 0.1183728352189064, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7658 + }, + { + "epoch": 6.11741214057508, + "grad_norm": 0.13251493871212006, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7659 + }, + { + "epoch": 6.118210862619808, + "grad_norm": 0.21223802864551544, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7660 + }, + { + "epoch": 6.1190095846645365, + "grad_norm": 0.0811460018157959, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7661 + }, + { + "epoch": 6.119808306709265, + "grad_norm": 0.13528718054294586, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7662 + }, + { + "epoch": 6.1206070287539935, + "grad_norm": 0.11806038022041321, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7663 + }, + { + "epoch": 6.121405750798722, + "grad_norm": 0.10022544860839844, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7664 + }, + { + "epoch": 6.122204472843451, + "grad_norm": 0.21452540159225464, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7665 + }, + { + "epoch": 6.123003194888179, + "grad_norm": 0.11949847638607025, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7666 + }, + { + "epoch": 6.123801916932908, + "grad_norm": 0.12636634707450867, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7667 + }, + { + "epoch": 6.124600638977636, + "grad_norm": 0.17132572829723358, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7668 + }, + { + "epoch": 6.125399361022364, + "grad_norm": 0.1116800457239151, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7669 + }, + { + "epoch": 6.126198083067092, + "grad_norm": 0.13965120911598206, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 7670 + }, + { + "epoch": 6.126996805111821, + "grad_norm": 0.1346610188484192, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7671 + }, + { + "epoch": 6.127795527156549, + "grad_norm": 0.07977228611707687, + "learning_rate": 0.0005, + "loss": 1.0466, + "step": 7672 + }, + { + "epoch": 6.128594249201278, + "grad_norm": 0.21412506699562073, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7673 + }, + { + "epoch": 6.1293929712460065, + "grad_norm": 0.172305628657341, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7674 + }, + { + "epoch": 6.130191693290735, + "grad_norm": 0.10782980173826218, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7675 + }, + { + "epoch": 6.1309904153354635, + "grad_norm": 0.23166432976722717, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 7676 + }, + { + "epoch": 6.131789137380192, + "grad_norm": 0.12337028980255127, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7677 + }, + { + "epoch": 6.13258785942492, + "grad_norm": 0.11406251043081284, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7678 + }, + { + "epoch": 6.133386581469648, + "grad_norm": 0.19163282215595245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7679 + }, + { + "epoch": 6.134185303514377, + "grad_norm": 0.06671248376369476, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7680 + }, + { + "epoch": 6.134984025559105, + "grad_norm": 0.13190557062625885, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7681 + }, + { + "epoch": 6.135782747603834, + "grad_norm": 0.20761321485042572, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7682 + }, + { + "epoch": 6.136581469648562, + "grad_norm": 0.08118047565221786, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7683 + }, + { + "epoch": 6.137380191693291, + "grad_norm": 0.1458984613418579, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7684 + }, + { + "epoch": 6.138178913738019, + "grad_norm": 0.1305929571390152, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7685 + }, + { + "epoch": 6.138977635782748, + "grad_norm": 0.0972108244895935, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7686 + }, + { + "epoch": 6.139776357827476, + "grad_norm": 0.14246216416358948, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7687 + }, + { + "epoch": 6.140575079872204, + "grad_norm": 0.04341820999979973, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7688 + }, + { + "epoch": 6.141373801916933, + "grad_norm": 0.127020001411438, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7689 + }, + { + "epoch": 6.142172523961661, + "grad_norm": 0.08494339138269424, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 7690 + }, + { + "epoch": 6.14297124600639, + "grad_norm": 0.11377454549074173, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7691 + }, + { + "epoch": 6.143769968051118, + "grad_norm": 0.13752779364585876, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 7692 + }, + { + "epoch": 6.144568690095847, + "grad_norm": 0.054878801107406616, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7693 + }, + { + "epoch": 6.145367412140575, + "grad_norm": 0.11313790827989578, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7694 + }, + { + "epoch": 6.146166134185304, + "grad_norm": 0.04388728365302086, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7695 + }, + { + "epoch": 6.146964856230032, + "grad_norm": 0.12842994928359985, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 7696 + }, + { + "epoch": 6.147763578274761, + "grad_norm": 0.1374971568584442, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7697 + }, + { + "epoch": 6.1485623003194885, + "grad_norm": 0.1082429438829422, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7698 + }, + { + "epoch": 6.149361022364217, + "grad_norm": 0.14329178631305695, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7699 + }, + { + "epoch": 6.1501597444089455, + "grad_norm": 0.07794678211212158, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7700 + }, + { + "epoch": 6.150958466453674, + "grad_norm": 0.10680928826332092, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7701 + }, + { + "epoch": 6.151757188498403, + "grad_norm": 0.11628691852092743, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7702 + }, + { + "epoch": 6.152555910543131, + "grad_norm": 0.03565143793821335, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7703 + }, + { + "epoch": 6.15335463258786, + "grad_norm": 0.10634133219718933, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 7704 + }, + { + "epoch": 6.154153354632588, + "grad_norm": 0.10307054221630096, + "learning_rate": 0.0005, + "loss": 1.0416, + "step": 7705 + }, + { + "epoch": 6.154952076677317, + "grad_norm": 0.05591967701911926, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 7706 + }, + { + "epoch": 6.155750798722044, + "grad_norm": 0.07205721735954285, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7707 + }, + { + "epoch": 6.156549520766773, + "grad_norm": 0.05020968243479729, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 7708 + }, + { + "epoch": 6.157348242811501, + "grad_norm": 0.037087470293045044, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7709 + }, + { + "epoch": 6.15814696485623, + "grad_norm": 0.06322529166936874, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7710 + }, + { + "epoch": 6.1589456869009584, + "grad_norm": 0.03881093114614487, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 7711 + }, + { + "epoch": 6.159744408945687, + "grad_norm": 0.06219052895903587, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7712 + }, + { + "epoch": 6.1605431309904155, + "grad_norm": 0.043313659727573395, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7713 + }, + { + "epoch": 6.161341853035144, + "grad_norm": 0.05460439994931221, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7714 + }, + { + "epoch": 6.162140575079873, + "grad_norm": 0.045017000287771225, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7715 + }, + { + "epoch": 6.1629392971246, + "grad_norm": 0.08029863983392715, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7716 + }, + { + "epoch": 6.163738019169329, + "grad_norm": 0.06935936212539673, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 7717 + }, + { + "epoch": 6.164536741214057, + "grad_norm": 0.12617695331573486, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 7718 + }, + { + "epoch": 6.165335463258786, + "grad_norm": 0.09746283292770386, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 7719 + }, + { + "epoch": 6.166134185303514, + "grad_norm": 0.038731649518013, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7720 + }, + { + "epoch": 6.166932907348243, + "grad_norm": 0.1054256334900856, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7721 + }, + { + "epoch": 6.167731629392971, + "grad_norm": 0.0833977535367012, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7722 + }, + { + "epoch": 6.1685303514377, + "grad_norm": 1.3529000282287598, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7723 + }, + { + "epoch": 6.169329073482428, + "grad_norm": 0.06748781353235245, + "learning_rate": 0.0005, + "loss": 1.0437, + "step": 7724 + }, + { + "epoch": 6.170127795527157, + "grad_norm": 0.06015792861580849, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7725 + }, + { + "epoch": 6.170926517571885, + "grad_norm": 0.07760192453861237, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7726 + }, + { + "epoch": 6.171725239616613, + "grad_norm": 0.09536328911781311, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7727 + }, + { + "epoch": 6.172523961661342, + "grad_norm": 0.051248203963041306, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7728 + }, + { + "epoch": 6.17332268370607, + "grad_norm": 0.09610000252723694, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7729 + }, + { + "epoch": 6.174121405750799, + "grad_norm": 0.0803515687584877, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7730 + }, + { + "epoch": 6.174920127795527, + "grad_norm": 0.0820179283618927, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7731 + }, + { + "epoch": 6.175718849840256, + "grad_norm": 0.08880780637264252, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 7732 + }, + { + "epoch": 6.176517571884984, + "grad_norm": 0.12188591808080673, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 7733 + }, + { + "epoch": 6.177316293929713, + "grad_norm": 0.06245967745780945, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 7734 + }, + { + "epoch": 6.178115015974441, + "grad_norm": 0.06608586013317108, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 7735 + }, + { + "epoch": 6.178913738019169, + "grad_norm": 0.08542132377624512, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 7736 + }, + { + "epoch": 6.1797124600638975, + "grad_norm": 0.06510723382234573, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7737 + }, + { + "epoch": 6.180511182108626, + "grad_norm": 0.161012202501297, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7738 + }, + { + "epoch": 6.181309904153355, + "grad_norm": 0.07943159341812134, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7739 + }, + { + "epoch": 6.182108626198083, + "grad_norm": 0.07735269516706467, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7740 + }, + { + "epoch": 6.182907348242812, + "grad_norm": 0.07452470809221268, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7741 + }, + { + "epoch": 6.18370607028754, + "grad_norm": 0.06378357857465744, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7742 + }, + { + "epoch": 6.184504792332269, + "grad_norm": 0.06149968132376671, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7743 + }, + { + "epoch": 6.185303514376997, + "grad_norm": 0.06558738648891449, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7744 + }, + { + "epoch": 6.186102236421725, + "grad_norm": 0.06004631146788597, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7745 + }, + { + "epoch": 6.186900958466453, + "grad_norm": 0.09972328692674637, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 7746 + }, + { + "epoch": 6.187699680511182, + "grad_norm": 0.059344276785850525, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 7747 + }, + { + "epoch": 6.18849840255591, + "grad_norm": 0.15083496272563934, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7748 + }, + { + "epoch": 6.189297124600639, + "grad_norm": 0.08041606843471527, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 7749 + }, + { + "epoch": 6.1900958466453675, + "grad_norm": 0.0801318883895874, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 7750 + }, + { + "epoch": 6.190894568690096, + "grad_norm": 0.13313926756381989, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 7751 + }, + { + "epoch": 6.1916932907348246, + "grad_norm": 0.07887420803308487, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7752 + }, + { + "epoch": 6.192492012779553, + "grad_norm": 0.08653397113084793, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7753 + }, + { + "epoch": 6.193290734824281, + "grad_norm": 0.12184617668390274, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7754 + }, + { + "epoch": 6.194089456869009, + "grad_norm": 0.05356535315513611, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7755 + }, + { + "epoch": 6.194888178913738, + "grad_norm": 0.09529519081115723, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7756 + }, + { + "epoch": 6.195686900958466, + "grad_norm": 0.07658126950263977, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 7757 + }, + { + "epoch": 6.196485623003195, + "grad_norm": 0.0785149484872818, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 7758 + }, + { + "epoch": 6.197284345047923, + "grad_norm": 0.10748651623725891, + "learning_rate": 0.0005, + "loss": 1.0449, + "step": 7759 + }, + { + "epoch": 6.198083067092652, + "grad_norm": 0.056907687336206436, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7760 + }, + { + "epoch": 6.19888178913738, + "grad_norm": 0.3713622987270355, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7761 + }, + { + "epoch": 6.199680511182109, + "grad_norm": 0.16671019792556763, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7762 + }, + { + "epoch": 6.2004792332268375, + "grad_norm": 0.10214395076036453, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7763 + }, + { + "epoch": 6.201277955271565, + "grad_norm": 0.09181013703346252, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7764 + }, + { + "epoch": 6.202076677316294, + "grad_norm": 0.18003405630588531, + "learning_rate": 0.0005, + "loss": 1.0419, + "step": 7765 + }, + { + "epoch": 6.202875399361022, + "grad_norm": 0.1032429188489914, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 7766 + }, + { + "epoch": 6.203674121405751, + "grad_norm": 0.06787005811929703, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7767 + }, + { + "epoch": 6.204472843450479, + "grad_norm": 0.09422674775123596, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7768 + }, + { + "epoch": 6.205271565495208, + "grad_norm": 0.04083932563662529, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7769 + }, + { + "epoch": 6.206070287539936, + "grad_norm": 0.1368017941713333, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7770 + }, + { + "epoch": 6.206869009584665, + "grad_norm": 0.23276877403259277, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7771 + }, + { + "epoch": 6.207667731629393, + "grad_norm": 0.13092860579490662, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7772 + }, + { + "epoch": 6.208466453674121, + "grad_norm": 0.14030441641807556, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7773 + }, + { + "epoch": 6.2092651757188495, + "grad_norm": 0.2016047090291977, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 7774 + }, + { + "epoch": 6.210063897763578, + "grad_norm": 0.1224871277809143, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7775 + }, + { + "epoch": 6.210862619808307, + "grad_norm": 0.10741977393627167, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7776 + }, + { + "epoch": 6.211661341853035, + "grad_norm": 0.19775021076202393, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7777 + }, + { + "epoch": 6.212460063897764, + "grad_norm": 0.06731278449296951, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7778 + }, + { + "epoch": 6.213258785942492, + "grad_norm": 0.14070862531661987, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 7779 + }, + { + "epoch": 6.214057507987221, + "grad_norm": 0.1267949938774109, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 7780 + }, + { + "epoch": 6.214856230031949, + "grad_norm": 0.0694371834397316, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 7781 + }, + { + "epoch": 6.215654952076678, + "grad_norm": 0.12222267687320709, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 7782 + }, + { + "epoch": 6.216453674121405, + "grad_norm": 0.1105445921421051, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7783 + }, + { + "epoch": 6.217252396166134, + "grad_norm": 0.05993608012795448, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7784 + }, + { + "epoch": 6.218051118210862, + "grad_norm": 0.11157821118831635, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7785 + }, + { + "epoch": 6.218849840255591, + "grad_norm": 0.05242336913943291, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7786 + }, + { + "epoch": 6.2196485623003195, + "grad_norm": 0.046115025877952576, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7787 + }, + { + "epoch": 6.220447284345048, + "grad_norm": 0.04029909893870354, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7788 + }, + { + "epoch": 6.2212460063897765, + "grad_norm": 0.057172924280166626, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7789 + }, + { + "epoch": 6.222044728434505, + "grad_norm": 0.04958837479352951, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7790 + }, + { + "epoch": 6.222843450479234, + "grad_norm": 0.046313852071762085, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7791 + }, + { + "epoch": 6.223642172523961, + "grad_norm": 0.03824630752205849, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7792 + }, + { + "epoch": 6.22444089456869, + "grad_norm": 0.07159019261598587, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7793 + }, + { + "epoch": 6.225239616613418, + "grad_norm": 0.06316389888525009, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 7794 + }, + { + "epoch": 6.226038338658147, + "grad_norm": 0.088447704911232, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7795 + }, + { + "epoch": 6.226837060702875, + "grad_norm": 0.08749943226575851, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7796 + }, + { + "epoch": 6.227635782747604, + "grad_norm": 0.08757520467042923, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 7797 + }, + { + "epoch": 6.228434504792332, + "grad_norm": 0.10777202993631363, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7798 + }, + { + "epoch": 6.229233226837061, + "grad_norm": 0.15780584514141083, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7799 + }, + { + "epoch": 6.2300319488817895, + "grad_norm": 0.10375814139842987, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7800 + }, + { + "epoch": 6.230830670926518, + "grad_norm": 0.3544321656227112, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7801 + }, + { + "epoch": 6.231629392971246, + "grad_norm": 0.11117644608020782, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7802 + }, + { + "epoch": 6.232428115015974, + "grad_norm": 0.13096286356449127, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7803 + }, + { + "epoch": 6.233226837060703, + "grad_norm": 0.2706630229949951, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 7804 + }, + { + "epoch": 6.234025559105431, + "grad_norm": 0.05805981904268265, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 7805 + }, + { + "epoch": 6.23482428115016, + "grad_norm": 0.14731241762638092, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7806 + }, + { + "epoch": 6.235623003194888, + "grad_norm": 0.08912478387355804, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 7807 + }, + { + "epoch": 6.236421725239617, + "grad_norm": 0.15754206478595734, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7808 + }, + { + "epoch": 6.237220447284345, + "grad_norm": 0.21143318712711334, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7809 + }, + { + "epoch": 6.238019169329074, + "grad_norm": 0.11839418858289719, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7810 + }, + { + "epoch": 6.2388178913738015, + "grad_norm": 0.23939856886863708, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7811 + }, + { + "epoch": 6.23961661341853, + "grad_norm": 0.1438305526971817, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7812 + }, + { + "epoch": 6.2404153354632586, + "grad_norm": 0.11111237108707428, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7813 + }, + { + "epoch": 6.241214057507987, + "grad_norm": 0.19577394425868988, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7814 + }, + { + "epoch": 6.242012779552716, + "grad_norm": 0.1399260312318802, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7815 + }, + { + "epoch": 6.242811501597444, + "grad_norm": 0.16393627226352692, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 7816 + }, + { + "epoch": 6.243610223642173, + "grad_norm": 0.15071940422058105, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 7817 + }, + { + "epoch": 6.244408945686901, + "grad_norm": 0.2121957242488861, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7818 + }, + { + "epoch": 6.24520766773163, + "grad_norm": 0.09854442626237869, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7819 + }, + { + "epoch": 6.246006389776358, + "grad_norm": 0.1327667534351349, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7820 + }, + { + "epoch": 6.246805111821086, + "grad_norm": 0.13909243047237396, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7821 + }, + { + "epoch": 6.247603833865814, + "grad_norm": 0.08482292294502258, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7822 + }, + { + "epoch": 6.248402555910543, + "grad_norm": 0.0918656438589096, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 7823 + }, + { + "epoch": 6.2492012779552715, + "grad_norm": 0.1352611631155014, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7824 + }, + { + "epoch": 6.25, + "grad_norm": 0.06178867816925049, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7825 + }, + { + "epoch": 6.2507987220447285, + "grad_norm": 0.1285342425107956, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7826 + }, + { + "epoch": 6.251597444089457, + "grad_norm": 0.17862951755523682, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 7827 + }, + { + "epoch": 6.252396166134186, + "grad_norm": 0.574928343296051, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 7828 + }, + { + "epoch": 6.253194888178914, + "grad_norm": 0.11522867530584335, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 7829 + }, + { + "epoch": 6.253993610223642, + "grad_norm": 0.08348001539707184, + "learning_rate": 0.0005, + "loss": 1.0422, + "step": 7830 + }, + { + "epoch": 6.25479233226837, + "grad_norm": 0.1015007346868515, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7831 + }, + { + "epoch": 6.255591054313099, + "grad_norm": 0.18213561177253723, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7832 + }, + { + "epoch": 6.256389776357827, + "grad_norm": 0.1056833565235138, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 7833 + }, + { + "epoch": 6.257188498402556, + "grad_norm": 0.09715890139341354, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7834 + }, + { + "epoch": 6.257987220447284, + "grad_norm": 0.17651355266571045, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7835 + }, + { + "epoch": 6.258785942492013, + "grad_norm": 0.11858265846967697, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7836 + }, + { + "epoch": 6.2595846645367414, + "grad_norm": 0.1400168240070343, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 7837 + }, + { + "epoch": 6.26038338658147, + "grad_norm": 0.2133244276046753, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7838 + }, + { + "epoch": 6.261182108626198, + "grad_norm": 0.087309330701828, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7839 + }, + { + "epoch": 6.261980830670926, + "grad_norm": 0.07735110074281693, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7840 + }, + { + "epoch": 6.262779552715655, + "grad_norm": 0.08314932882785797, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7841 + }, + { + "epoch": 6.263578274760383, + "grad_norm": 0.13448217511177063, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 7842 + }, + { + "epoch": 6.264376996805112, + "grad_norm": 1.4022712707519531, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7843 + }, + { + "epoch": 6.26517571884984, + "grad_norm": 0.1107354387640953, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 7844 + }, + { + "epoch": 6.265974440894569, + "grad_norm": 0.17282478511333466, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7845 + }, + { + "epoch": 6.266773162939297, + "grad_norm": 0.0903516560792923, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 7846 + }, + { + "epoch": 6.267571884984026, + "grad_norm": 0.07628770172595978, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7847 + }, + { + "epoch": 6.268370607028754, + "grad_norm": 0.08877440541982651, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 7848 + }, + { + "epoch": 6.269169329073483, + "grad_norm": 0.041159700602293015, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 7849 + }, + { + "epoch": 6.2699680511182105, + "grad_norm": 0.09187504649162292, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7850 + }, + { + "epoch": 6.270766773162939, + "grad_norm": 0.11252478510141373, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 7851 + }, + { + "epoch": 6.271565495207668, + "grad_norm": 0.04354100301861763, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7852 + }, + { + "epoch": 6.272364217252396, + "grad_norm": 0.06845738738775253, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 7853 + }, + { + "epoch": 6.273162939297125, + "grad_norm": 0.047235157340765, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7854 + }, + { + "epoch": 6.273961661341853, + "grad_norm": 0.04571741819381714, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 7855 + }, + { + "epoch": 6.274760383386582, + "grad_norm": 0.09801016747951508, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 7856 + }, + { + "epoch": 6.27555910543131, + "grad_norm": 0.12422922253608704, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 7857 + }, + { + "epoch": 6.276357827476039, + "grad_norm": 0.07283129543066025, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7858 + }, + { + "epoch": 6.277156549520766, + "grad_norm": 0.07217510044574738, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7859 + }, + { + "epoch": 6.277955271565495, + "grad_norm": 0.1102033257484436, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7860 + }, + { + "epoch": 6.2787539936102235, + "grad_norm": 0.0814276710152626, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 7861 + }, + { + "epoch": 6.279552715654952, + "grad_norm": 0.08247577399015427, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 7862 + }, + { + "epoch": 6.2803514376996805, + "grad_norm": 0.04042622447013855, + "learning_rate": 0.0005, + "loss": 1.0448, + "step": 7863 + }, + { + "epoch": 6.281150159744409, + "grad_norm": 0.049153268337249756, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 7864 + }, + { + "epoch": 6.281948881789138, + "grad_norm": 0.07062675058841705, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7865 + }, + { + "epoch": 6.282747603833866, + "grad_norm": 0.06458686292171478, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7866 + }, + { + "epoch": 6.283546325878595, + "grad_norm": 0.093512162566185, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7867 + }, + { + "epoch": 6.284345047923322, + "grad_norm": 0.054384954273700714, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7868 + }, + { + "epoch": 6.285143769968051, + "grad_norm": 0.06253736466169357, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7869 + }, + { + "epoch": 6.285942492012779, + "grad_norm": 0.05566808953881264, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 7870 + }, + { + "epoch": 6.286741214057508, + "grad_norm": 0.07693472504615784, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7871 + }, + { + "epoch": 6.287539936102236, + "grad_norm": 0.04471312463283539, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7872 + }, + { + "epoch": 6.288338658146965, + "grad_norm": 0.050770796835422516, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7873 + }, + { + "epoch": 6.289137380191693, + "grad_norm": 0.04736769199371338, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7874 + }, + { + "epoch": 6.289936102236422, + "grad_norm": 0.06550426036119461, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7875 + }, + { + "epoch": 6.2907348242811505, + "grad_norm": 0.0524384006857872, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 7876 + }, + { + "epoch": 6.291533546325878, + "grad_norm": 0.10091802477836609, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7877 + }, + { + "epoch": 6.292332268370607, + "grad_norm": 0.14296530187129974, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7878 + }, + { + "epoch": 6.293130990415335, + "grad_norm": 0.08703069388866425, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 7879 + }, + { + "epoch": 6.293929712460064, + "grad_norm": 0.05628393217921257, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 7880 + }, + { + "epoch": 6.294728434504792, + "grad_norm": 0.09164825826883316, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7881 + }, + { + "epoch": 6.295527156549521, + "grad_norm": 0.09182474762201309, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7882 + }, + { + "epoch": 6.296325878594249, + "grad_norm": 0.03495810180902481, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7883 + }, + { + "epoch": 6.297124600638978, + "grad_norm": 0.07738466560840607, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7884 + }, + { + "epoch": 6.297923322683706, + "grad_norm": 0.06034242361783981, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 7885 + }, + { + "epoch": 6.298722044728435, + "grad_norm": 0.04083844646811485, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 7886 + }, + { + "epoch": 6.2995207667731625, + "grad_norm": 0.0918336734175682, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 7887 + }, + { + "epoch": 6.300319488817891, + "grad_norm": 0.07351864874362946, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7888 + }, + { + "epoch": 6.30111821086262, + "grad_norm": 0.042986564338207245, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 7889 + }, + { + "epoch": 6.301916932907348, + "grad_norm": 0.05983031541109085, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7890 + }, + { + "epoch": 6.302715654952077, + "grad_norm": 0.10980594903230667, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7891 + }, + { + "epoch": 6.303514376996805, + "grad_norm": 0.04517138749361038, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 7892 + }, + { + "epoch": 6.304313099041534, + "grad_norm": 0.08489427715539932, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7893 + }, + { + "epoch": 6.305111821086262, + "grad_norm": 0.040421262383461, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7894 + }, + { + "epoch": 6.305910543130991, + "grad_norm": 0.0438009649515152, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 7895 + }, + { + "epoch": 6.306709265175719, + "grad_norm": 0.05797100067138672, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7896 + }, + { + "epoch": 6.307507987220447, + "grad_norm": 0.08798980712890625, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 7897 + }, + { + "epoch": 6.3083067092651754, + "grad_norm": 0.0502130500972271, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 7898 + }, + { + "epoch": 6.309105431309904, + "grad_norm": 0.11610639840364456, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 7899 + }, + { + "epoch": 6.3099041533546325, + "grad_norm": 0.061168819665908813, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7900 + }, + { + "epoch": 6.310702875399361, + "grad_norm": 0.0469425804913044, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7901 + }, + { + "epoch": 6.31150159744409, + "grad_norm": 0.0483059324324131, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 7902 + }, + { + "epoch": 6.312300319488818, + "grad_norm": 0.120233453810215, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7903 + }, + { + "epoch": 6.313099041533547, + "grad_norm": 0.10025710612535477, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 7904 + }, + { + "epoch": 6.313897763578275, + "grad_norm": 0.08750995993614197, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 7905 + }, + { + "epoch": 6.314696485623003, + "grad_norm": 0.31308433413505554, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7906 + }, + { + "epoch": 6.315495207667731, + "grad_norm": 0.06390809267759323, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7907 + }, + { + "epoch": 6.31629392971246, + "grad_norm": 0.0657041072845459, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7908 + }, + { + "epoch": 6.317092651757188, + "grad_norm": 0.09626918286085129, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 7909 + }, + { + "epoch": 6.317891373801917, + "grad_norm": 0.05565343424677849, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 7910 + }, + { + "epoch": 6.318690095846645, + "grad_norm": 0.06147831678390503, + "learning_rate": 0.0005, + "loss": 1.0424, + "step": 7911 + }, + { + "epoch": 6.319488817891374, + "grad_norm": 0.08704033493995667, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 7912 + }, + { + "epoch": 6.3202875399361025, + "grad_norm": 0.04405020549893379, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 7913 + }, + { + "epoch": 6.321086261980831, + "grad_norm": 0.07587708532810211, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7914 + }, + { + "epoch": 6.321884984025559, + "grad_norm": 0.05935811623930931, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 7915 + }, + { + "epoch": 6.322683706070287, + "grad_norm": 0.045584313571453094, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 7916 + }, + { + "epoch": 6.323482428115016, + "grad_norm": 0.065196193754673, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 7917 + }, + { + "epoch": 6.324281150159744, + "grad_norm": 0.05996553227305412, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 7918 + }, + { + "epoch": 6.325079872204473, + "grad_norm": 0.04771357774734497, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7919 + }, + { + "epoch": 6.325878594249201, + "grad_norm": 0.05875687673687935, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 7920 + }, + { + "epoch": 6.32667731629393, + "grad_norm": 0.15765227377414703, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 7921 + }, + { + "epoch": 6.327476038338658, + "grad_norm": 0.038563717156648636, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7922 + }, + { + "epoch": 6.328274760383387, + "grad_norm": 0.04321083426475525, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 7923 + }, + { + "epoch": 6.329073482428115, + "grad_norm": 0.04427725449204445, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7924 + }, + { + "epoch": 6.329872204472843, + "grad_norm": 0.06047825515270233, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 7925 + }, + { + "epoch": 6.330670926517572, + "grad_norm": 0.05161035805940628, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 7926 + }, + { + "epoch": 6.3314696485623, + "grad_norm": 0.06512151658535004, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 7927 + }, + { + "epoch": 6.332268370607029, + "grad_norm": 0.05178358778357506, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 7928 + }, + { + "epoch": 6.333067092651757, + "grad_norm": 0.06199260801076889, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 7929 + }, + { + "epoch": 6.333865814696486, + "grad_norm": 0.09948168694972992, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7930 + }, + { + "epoch": 6.334664536741214, + "grad_norm": 0.06568150222301483, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7931 + }, + { + "epoch": 6.335463258785943, + "grad_norm": 0.036642882972955704, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 7932 + }, + { + "epoch": 6.336261980830671, + "grad_norm": 0.04814688116312027, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7933 + }, + { + "epoch": 6.3370607028754, + "grad_norm": 0.03938854858279228, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 7934 + }, + { + "epoch": 6.337859424920127, + "grad_norm": 0.07778320461511612, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7935 + }, + { + "epoch": 6.338658146964856, + "grad_norm": 0.16271090507507324, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7936 + }, + { + "epoch": 6.3394568690095845, + "grad_norm": 0.3652990460395813, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 7937 + }, + { + "epoch": 6.340255591054313, + "grad_norm": 0.0592365525662899, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 7938 + }, + { + "epoch": 6.3410543130990416, + "grad_norm": 0.28622883558273315, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 7939 + }, + { + "epoch": 6.34185303514377, + "grad_norm": 0.2270730584859848, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 7940 + }, + { + "epoch": 6.342651757188499, + "grad_norm": 0.10781756043434143, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 7941 + }, + { + "epoch": 6.343450479233227, + "grad_norm": 0.11611706018447876, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 7942 + }, + { + "epoch": 6.344249201277956, + "grad_norm": 0.08212626725435257, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7943 + }, + { + "epoch": 6.345047923322683, + "grad_norm": 0.0739196389913559, + "learning_rate": 0.0005, + "loss": 1.0438, + "step": 7944 + }, + { + "epoch": 6.345846645367412, + "grad_norm": 0.1029743030667305, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 7945 + }, + { + "epoch": 6.34664536741214, + "grad_norm": 0.2787686586380005, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7946 + }, + { + "epoch": 6.347444089456869, + "grad_norm": 0.12180152535438538, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 7947 + }, + { + "epoch": 6.348242811501597, + "grad_norm": 0.178681880235672, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 7948 + }, + { + "epoch": 6.349041533546326, + "grad_norm": 0.10219722986221313, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 7949 + }, + { + "epoch": 6.3498402555910545, + "grad_norm": 0.0773158147931099, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 7950 + }, + { + "epoch": 6.350638977635783, + "grad_norm": 0.15096192061901093, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 7951 + }, + { + "epoch": 6.3514376996805115, + "grad_norm": 0.06237277388572693, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 7952 + }, + { + "epoch": 6.352236421725239, + "grad_norm": 1.4819257259368896, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 7953 + }, + { + "epoch": 6.353035143769968, + "grad_norm": 0.09716464579105377, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 7954 + }, + { + "epoch": 6.353833865814696, + "grad_norm": 0.10105668753385544, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 7955 + }, + { + "epoch": 6.354632587859425, + "grad_norm": 0.09361526370048523, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7956 + }, + { + "epoch": 6.355431309904153, + "grad_norm": 0.04209212213754654, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 7957 + }, + { + "epoch": 6.356230031948882, + "grad_norm": 0.11653190106153488, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7958 + }, + { + "epoch": 6.35702875399361, + "grad_norm": 0.1552112102508545, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 7959 + }, + { + "epoch": 6.357827476038339, + "grad_norm": 0.07934660464525223, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 7960 + }, + { + "epoch": 6.358626198083067, + "grad_norm": 0.10928693413734436, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 7961 + }, + { + "epoch": 6.359424920127796, + "grad_norm": 0.15923380851745605, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7962 + }, + { + "epoch": 6.360223642172524, + "grad_norm": 0.12151104211807251, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 7963 + }, + { + "epoch": 6.361022364217252, + "grad_norm": 0.055971868336200714, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 7964 + }, + { + "epoch": 6.361821086261981, + "grad_norm": 0.17611366510391235, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 7965 + }, + { + "epoch": 6.362619808306709, + "grad_norm": 0.16098986566066742, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 7966 + }, + { + "epoch": 6.363418530351438, + "grad_norm": 1.6793769598007202, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 7967 + }, + { + "epoch": 6.364217252396166, + "grad_norm": 0.4322223365306854, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 7968 + }, + { + "epoch": 6.365015974440895, + "grad_norm": 0.35510173439979553, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 7969 + }, + { + "epoch": 6.365814696485623, + "grad_norm": 0.08799898624420166, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 7970 + }, + { + "epoch": 6.366613418530352, + "grad_norm": 0.28774675726890564, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 7971 + }, + { + "epoch": 6.36741214057508, + "grad_norm": 0.28109011054039, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 7972 + }, + { + "epoch": 6.368210862619808, + "grad_norm": 0.09055986255407333, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7973 + }, + { + "epoch": 6.3690095846645365, + "grad_norm": 0.15083353221416473, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 7974 + }, + { + "epoch": 6.369808306709265, + "grad_norm": 0.20686668157577515, + "learning_rate": 0.0005, + "loss": 1.0642, + "step": 7975 + }, + { + "epoch": 6.3706070287539935, + "grad_norm": 0.047575660049915314, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 7976 + }, + { + "epoch": 6.371405750798722, + "grad_norm": 0.25424477458000183, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 7977 + }, + { + "epoch": 6.372204472843451, + "grad_norm": 0.21839222311973572, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 7978 + }, + { + "epoch": 6.373003194888179, + "grad_norm": 0.06493431329727173, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 7979 + }, + { + "epoch": 6.373801916932908, + "grad_norm": 0.2369518280029297, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 7980 + }, + { + "epoch": 6.374600638977636, + "grad_norm": 0.14641214907169342, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 7981 + }, + { + "epoch": 6.375399361022364, + "grad_norm": 0.11602997034788132, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 7982 + }, + { + "epoch": 6.376198083067092, + "grad_norm": 0.18792425096035004, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 7983 + }, + { + "epoch": 6.376996805111821, + "grad_norm": 0.06824373453855515, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 7984 + }, + { + "epoch": 6.377795527156549, + "grad_norm": 0.1228032335639, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7985 + }, + { + "epoch": 6.378594249201278, + "grad_norm": 0.15771286189556122, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 7986 + }, + { + "epoch": 6.3793929712460065, + "grad_norm": 0.1157795861363411, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 7987 + }, + { + "epoch": 6.380191693290735, + "grad_norm": 0.07282877713441849, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 7988 + }, + { + "epoch": 6.3809904153354635, + "grad_norm": 0.10168643295764923, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 7989 + }, + { + "epoch": 6.381789137380192, + "grad_norm": 0.24466580152511597, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 7990 + }, + { + "epoch": 6.38258785942492, + "grad_norm": 0.0972297191619873, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 7991 + }, + { + "epoch": 6.383386581469648, + "grad_norm": 0.08349917083978653, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 7992 + }, + { + "epoch": 6.384185303514377, + "grad_norm": 0.058114584535360336, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 7993 + }, + { + "epoch": 6.384984025559105, + "grad_norm": 0.04745171591639519, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 7994 + }, + { + "epoch": 6.385782747603834, + "grad_norm": 0.05484034866094589, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 7995 + }, + { + "epoch": 6.386581469648562, + "grad_norm": 0.05094960704445839, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 7996 + }, + { + "epoch": 6.387380191693291, + "grad_norm": 0.06368618458509445, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 7997 + }, + { + "epoch": 6.388178913738019, + "grad_norm": 0.07042541354894638, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 7998 + }, + { + "epoch": 6.388977635782748, + "grad_norm": 0.06182365491986275, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 7999 + }, + { + "epoch": 6.389776357827476, + "grad_norm": 0.05778853967785835, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8000 + }, + { + "epoch": 6.390575079872204, + "grad_norm": 0.04334365949034691, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8001 + }, + { + "epoch": 6.391373801916933, + "grad_norm": 0.08214148133993149, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8002 + }, + { + "epoch": 6.392172523961661, + "grad_norm": 0.05468964949250221, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8003 + }, + { + "epoch": 6.39297124600639, + "grad_norm": 0.07484348863363266, + "learning_rate": 0.0005, + "loss": 1.042, + "step": 8004 + }, + { + "epoch": 6.393769968051118, + "grad_norm": 0.04987887665629387, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8005 + }, + { + "epoch": 6.394568690095847, + "grad_norm": 0.05584597587585449, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8006 + }, + { + "epoch": 6.395367412140575, + "grad_norm": 0.07088904082775116, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8007 + }, + { + "epoch": 6.396166134185304, + "grad_norm": 0.26695576310157776, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8008 + }, + { + "epoch": 6.396964856230032, + "grad_norm": 0.06452658027410507, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8009 + }, + { + "epoch": 6.397763578274761, + "grad_norm": 0.08994145691394806, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8010 + }, + { + "epoch": 6.3985623003194885, + "grad_norm": 0.06565240770578384, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8011 + }, + { + "epoch": 6.399361022364217, + "grad_norm": 0.0492648184299469, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8012 + }, + { + "epoch": 6.4001597444089455, + "grad_norm": 0.06946985423564911, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8013 + }, + { + "epoch": 6.400958466453674, + "grad_norm": 0.08669331669807434, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8014 + }, + { + "epoch": 6.401757188498403, + "grad_norm": 0.07930289953947067, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8015 + }, + { + "epoch": 6.402555910543131, + "grad_norm": 0.15216746926307678, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8016 + }, + { + "epoch": 6.40335463258786, + "grad_norm": 0.051862914115190506, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8017 + }, + { + "epoch": 6.404153354632588, + "grad_norm": 0.044119443744421005, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8018 + }, + { + "epoch": 6.404952076677317, + "grad_norm": 0.09787813574075699, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8019 + }, + { + "epoch": 6.405750798722044, + "grad_norm": 0.05269203707575798, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8020 + }, + { + "epoch": 6.406549520766773, + "grad_norm": 0.06683865934610367, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8021 + }, + { + "epoch": 6.407348242811501, + "grad_norm": 0.04334628954529762, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8022 + }, + { + "epoch": 6.40814696485623, + "grad_norm": 0.037559930235147476, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8023 + }, + { + "epoch": 6.4089456869009584, + "grad_norm": 0.21066749095916748, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8024 + }, + { + "epoch": 6.409744408945687, + "grad_norm": 0.05721563845872879, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8025 + }, + { + "epoch": 6.4105431309904155, + "grad_norm": 0.047683823853731155, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8026 + }, + { + "epoch": 6.411341853035144, + "grad_norm": 0.05377231910824776, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8027 + }, + { + "epoch": 6.412140575079873, + "grad_norm": 0.05604357272386551, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8028 + }, + { + "epoch": 6.4129392971246, + "grad_norm": 0.051680225878953934, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8029 + }, + { + "epoch": 6.413738019169329, + "grad_norm": 0.04465701803565025, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8030 + }, + { + "epoch": 6.414536741214057, + "grad_norm": 0.0454387366771698, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8031 + }, + { + "epoch": 6.415335463258786, + "grad_norm": 0.5079139471054077, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8032 + }, + { + "epoch": 6.416134185303514, + "grad_norm": 0.08386353403329849, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8033 + }, + { + "epoch": 6.416932907348243, + "grad_norm": 0.06023477017879486, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8034 + }, + { + "epoch": 6.417731629392971, + "grad_norm": 0.8634743094444275, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8035 + }, + { + "epoch": 6.4185303514377, + "grad_norm": 0.06926131993532181, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8036 + }, + { + "epoch": 6.419329073482428, + "grad_norm": 0.07563464343547821, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8037 + }, + { + "epoch": 6.420127795527157, + "grad_norm": 0.10181237757205963, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8038 + }, + { + "epoch": 6.420926517571885, + "grad_norm": 0.13995511829853058, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8039 + }, + { + "epoch": 6.421725239616613, + "grad_norm": 0.05968187376856804, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8040 + }, + { + "epoch": 6.422523961661342, + "grad_norm": 0.14419680833816528, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8041 + }, + { + "epoch": 6.42332268370607, + "grad_norm": 0.13762469589710236, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8042 + }, + { + "epoch": 6.424121405750799, + "grad_norm": 0.0627644956111908, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8043 + }, + { + "epoch": 6.424920127795527, + "grad_norm": 0.1356768012046814, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8044 + }, + { + "epoch": 6.425718849840256, + "grad_norm": 0.12080833315849304, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8045 + }, + { + "epoch": 6.426517571884984, + "grad_norm": 0.048654112964868546, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8046 + }, + { + "epoch": 6.427316293929713, + "grad_norm": 0.11983022093772888, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8047 + }, + { + "epoch": 6.428115015974441, + "grad_norm": 0.09429550170898438, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8048 + }, + { + "epoch": 6.428913738019169, + "grad_norm": 0.07924454659223557, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8049 + }, + { + "epoch": 6.4297124600638975, + "grad_norm": 0.15244926512241364, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8050 + }, + { + "epoch": 6.430511182108626, + "grad_norm": 0.9872325658798218, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8051 + }, + { + "epoch": 6.431309904153355, + "grad_norm": 0.0790395438671112, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8052 + }, + { + "epoch": 6.432108626198083, + "grad_norm": 0.3828068673610687, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8053 + }, + { + "epoch": 6.432907348242812, + "grad_norm": 0.059630244970321655, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8054 + }, + { + "epoch": 6.43370607028754, + "grad_norm": 0.07113327085971832, + "learning_rate": 0.0005, + "loss": 1.0445, + "step": 8055 + }, + { + "epoch": 6.434504792332269, + "grad_norm": 0.0496523454785347, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8056 + }, + { + "epoch": 6.435303514376997, + "grad_norm": 0.08502436429262161, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8057 + }, + { + "epoch": 6.436102236421725, + "grad_norm": 0.06082376837730408, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8058 + }, + { + "epoch": 6.436900958466453, + "grad_norm": 0.1668524146080017, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8059 + }, + { + "epoch": 6.437699680511182, + "grad_norm": 0.05411513149738312, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8060 + }, + { + "epoch": 6.43849840255591, + "grad_norm": 0.05176519230008125, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8061 + }, + { + "epoch": 6.439297124600639, + "grad_norm": 0.0684237852692604, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8062 + }, + { + "epoch": 6.4400958466453675, + "grad_norm": 0.0715038925409317, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8063 + }, + { + "epoch": 6.440894568690096, + "grad_norm": 0.11311113089323044, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8064 + }, + { + "epoch": 6.4416932907348246, + "grad_norm": 0.06320979446172714, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8065 + }, + { + "epoch": 6.442492012779553, + "grad_norm": 0.09221892803907394, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8066 + }, + { + "epoch": 6.443290734824281, + "grad_norm": 0.1183326244354248, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8067 + }, + { + "epoch": 6.444089456869009, + "grad_norm": 0.08447464555501938, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8068 + }, + { + "epoch": 6.444888178913738, + "grad_norm": 0.21791045367717743, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8069 + }, + { + "epoch": 6.445686900958466, + "grad_norm": 0.055015772581100464, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8070 + }, + { + "epoch": 6.446485623003195, + "grad_norm": 0.13536514341831207, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8071 + }, + { + "epoch": 6.447284345047923, + "grad_norm": 0.16620422899723053, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8072 + }, + { + "epoch": 6.448083067092652, + "grad_norm": 0.08793147653341293, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8073 + }, + { + "epoch": 6.44888178913738, + "grad_norm": 0.0962347462773323, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8074 + }, + { + "epoch": 6.449680511182109, + "grad_norm": 0.08764681965112686, + "learning_rate": 0.0005, + "loss": 1.0463, + "step": 8075 + }, + { + "epoch": 6.4504792332268375, + "grad_norm": 0.06176106259226799, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8076 + }, + { + "epoch": 6.451277955271565, + "grad_norm": 0.06823577731847763, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8077 + }, + { + "epoch": 6.452076677316294, + "grad_norm": 0.11239560693502426, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8078 + }, + { + "epoch": 6.452875399361022, + "grad_norm": 0.10309527069330215, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8079 + }, + { + "epoch": 6.453674121405751, + "grad_norm": 0.07533836364746094, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8080 + }, + { + "epoch": 6.454472843450479, + "grad_norm": 0.06650671362876892, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8081 + }, + { + "epoch": 6.455271565495208, + "grad_norm": 0.1700691431760788, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8082 + }, + { + "epoch": 6.456070287539936, + "grad_norm": 0.06135572865605354, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8083 + }, + { + "epoch": 6.456869009584665, + "grad_norm": 0.08333424478769302, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8084 + }, + { + "epoch": 6.457667731629393, + "grad_norm": 0.1338927149772644, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8085 + }, + { + "epoch": 6.458466453674122, + "grad_norm": 0.07097163796424866, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8086 + }, + { + "epoch": 6.4592651757188495, + "grad_norm": 0.06296008080244064, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8087 + }, + { + "epoch": 6.460063897763578, + "grad_norm": 0.060656916350126266, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8088 + }, + { + "epoch": 6.460862619808307, + "grad_norm": 0.044889576733112335, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8089 + }, + { + "epoch": 6.461661341853035, + "grad_norm": 0.0749807357788086, + "learning_rate": 0.0005, + "loss": 1.0456, + "step": 8090 + }, + { + "epoch": 6.462460063897764, + "grad_norm": 0.07509054243564606, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8091 + }, + { + "epoch": 6.463258785942492, + "grad_norm": 0.054954417049884796, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8092 + }, + { + "epoch": 6.464057507987221, + "grad_norm": 0.05087047815322876, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8093 + }, + { + "epoch": 6.464856230031949, + "grad_norm": 0.12205887585878372, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8094 + }, + { + "epoch": 6.465654952076678, + "grad_norm": 0.08342424035072327, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8095 + }, + { + "epoch": 6.466453674121405, + "grad_norm": 0.12507228553295135, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8096 + }, + { + "epoch": 6.467252396166134, + "grad_norm": 0.10491037368774414, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 8097 + }, + { + "epoch": 6.468051118210862, + "grad_norm": 0.04236119985580444, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8098 + }, + { + "epoch": 6.468849840255591, + "grad_norm": 0.10601458698511124, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8099 + }, + { + "epoch": 6.4696485623003195, + "grad_norm": 0.07485921680927277, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8100 + }, + { + "epoch": 6.470447284345048, + "grad_norm": 0.06351220607757568, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8101 + }, + { + "epoch": 6.4712460063897765, + "grad_norm": 0.08351211249828339, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8102 + }, + { + "epoch": 6.472044728434505, + "grad_norm": 0.07205908000469208, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8103 + }, + { + "epoch": 6.472843450479234, + "grad_norm": 0.07072018831968307, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8104 + }, + { + "epoch": 6.473642172523961, + "grad_norm": 0.0851733461022377, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8105 + }, + { + "epoch": 6.47444089456869, + "grad_norm": 0.07046044617891312, + "learning_rate": 0.0005, + "loss": 1.0459, + "step": 8106 + }, + { + "epoch": 6.475239616613418, + "grad_norm": 0.03804340958595276, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8107 + }, + { + "epoch": 6.476038338658147, + "grad_norm": 0.059083763509988785, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8108 + }, + { + "epoch": 6.476837060702875, + "grad_norm": 0.0419149249792099, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8109 + }, + { + "epoch": 6.477635782747604, + "grad_norm": 0.07814865559339523, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8110 + }, + { + "epoch": 6.478434504792332, + "grad_norm": 0.12653781473636627, + "learning_rate": 0.0005, + "loss": 1.0469, + "step": 8111 + }, + { + "epoch": 6.479233226837061, + "grad_norm": 0.10124429315328598, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8112 + }, + { + "epoch": 6.4800319488817895, + "grad_norm": 0.05563808232545853, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8113 + }, + { + "epoch": 6.480830670926517, + "grad_norm": 0.07036174833774567, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8114 + }, + { + "epoch": 6.481629392971246, + "grad_norm": 0.0452839694917202, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8115 + }, + { + "epoch": 6.482428115015974, + "grad_norm": 0.13880759477615356, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8116 + }, + { + "epoch": 6.483226837060703, + "grad_norm": 0.03902722895145416, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8117 + }, + { + "epoch": 6.484025559105431, + "grad_norm": 0.08136945217847824, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8118 + }, + { + "epoch": 6.48482428115016, + "grad_norm": 0.09874774515628815, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8119 + }, + { + "epoch": 6.485623003194888, + "grad_norm": 0.06836161017417908, + "learning_rate": 0.0005, + "loss": 1.0457, + "step": 8120 + }, + { + "epoch": 6.486421725239617, + "grad_norm": 0.1439940482378006, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8121 + }, + { + "epoch": 6.487220447284345, + "grad_norm": 0.0924125388264656, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8122 + }, + { + "epoch": 6.488019169329074, + "grad_norm": 0.06811019778251648, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8123 + }, + { + "epoch": 6.488817891373802, + "grad_norm": 0.1259799599647522, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8124 + }, + { + "epoch": 6.48961661341853, + "grad_norm": 0.1088009849190712, + "learning_rate": 0.0005, + "loss": 1.05, + "step": 8125 + }, + { + "epoch": 6.4904153354632586, + "grad_norm": 0.27054721117019653, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8126 + }, + { + "epoch": 6.491214057507987, + "grad_norm": 0.09674181789159775, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8127 + }, + { + "epoch": 6.492012779552716, + "grad_norm": 0.15491390228271484, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8128 + }, + { + "epoch": 6.492811501597444, + "grad_norm": 0.08790267258882523, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8129 + }, + { + "epoch": 6.493610223642173, + "grad_norm": 0.19372408092021942, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8130 + }, + { + "epoch": 6.494408945686901, + "grad_norm": 0.14786171913146973, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8131 + }, + { + "epoch": 6.49520766773163, + "grad_norm": 0.09591338783502579, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8132 + }, + { + "epoch": 6.496006389776358, + "grad_norm": 0.1810663491487503, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8133 + }, + { + "epoch": 6.496805111821086, + "grad_norm": 0.19754691421985626, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8134 + }, + { + "epoch": 6.497603833865814, + "grad_norm": 0.14094877243041992, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8135 + }, + { + "epoch": 6.498402555910543, + "grad_norm": 0.0782506987452507, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8136 + }, + { + "epoch": 6.4992012779552715, + "grad_norm": 0.19543413817882538, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8137 + }, + { + "epoch": 6.5, + "grad_norm": 0.3102439045906067, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8138 + }, + { + "epoch": 6.5007987220447285, + "grad_norm": 0.13952040672302246, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8139 + }, + { + "epoch": 6.501597444089457, + "grad_norm": 0.1902403086423874, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8140 + }, + { + "epoch": 6.502396166134186, + "grad_norm": 0.2608654499053955, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8141 + }, + { + "epoch": 6.503194888178914, + "grad_norm": 0.22480152547359467, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8142 + }, + { + "epoch": 6.503993610223642, + "grad_norm": 0.21580660343170166, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8143 + }, + { + "epoch": 6.50479233226837, + "grad_norm": 0.1991831213235855, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 8144 + }, + { + "epoch": 6.505591054313099, + "grad_norm": 0.25885632634162903, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8145 + }, + { + "epoch": 6.506389776357827, + "grad_norm": 0.2533574104309082, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8146 + }, + { + "epoch": 6.507188498402556, + "grad_norm": 0.11494381725788116, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8147 + }, + { + "epoch": 6.507987220447284, + "grad_norm": 0.1361113339662552, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8148 + }, + { + "epoch": 6.508785942492013, + "grad_norm": 0.22099947929382324, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8149 + }, + { + "epoch": 6.5095846645367414, + "grad_norm": 0.13223077356815338, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8150 + }, + { + "epoch": 6.51038338658147, + "grad_norm": 0.18203037977218628, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8151 + }, + { + "epoch": 6.511182108626198, + "grad_norm": 0.18066702783107758, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8152 + }, + { + "epoch": 6.511980830670926, + "grad_norm": 0.09984144568443298, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8153 + }, + { + "epoch": 6.512779552715655, + "grad_norm": 0.12803718447685242, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8154 + }, + { + "epoch": 6.513578274760383, + "grad_norm": 0.19731956720352173, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8155 + }, + { + "epoch": 6.514376996805112, + "grad_norm": 0.10687378793954849, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8156 + }, + { + "epoch": 6.51517571884984, + "grad_norm": 0.0971442237496376, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8157 + }, + { + "epoch": 6.515974440894569, + "grad_norm": 0.12840867042541504, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8158 + }, + { + "epoch": 6.516773162939297, + "grad_norm": 0.1245417669415474, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8159 + }, + { + "epoch": 6.517571884984026, + "grad_norm": 0.16850991547107697, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 8160 + }, + { + "epoch": 6.518370607028754, + "grad_norm": 0.1931404322385788, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8161 + }, + { + "epoch": 6.519169329073483, + "grad_norm": 0.08180713653564453, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8162 + }, + { + "epoch": 6.5199680511182105, + "grad_norm": 0.24530328810214996, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8163 + }, + { + "epoch": 6.520766773162939, + "grad_norm": 0.14107894897460938, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8164 + }, + { + "epoch": 6.521565495207668, + "grad_norm": 0.07984111458063126, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8165 + }, + { + "epoch": 6.522364217252396, + "grad_norm": 0.20894968509674072, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8166 + }, + { + "epoch": 6.523162939297125, + "grad_norm": 0.09663927555084229, + "learning_rate": 0.0005, + "loss": 1.0451, + "step": 8167 + }, + { + "epoch": 6.523961661341853, + "grad_norm": 0.0913434773683548, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8168 + }, + { + "epoch": 6.524760383386582, + "grad_norm": 0.1247463971376419, + "learning_rate": 0.0005, + "loss": 1.041, + "step": 8169 + }, + { + "epoch": 6.52555910543131, + "grad_norm": 0.06504802405834198, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8170 + }, + { + "epoch": 6.526357827476039, + "grad_norm": 0.10900555551052094, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8171 + }, + { + "epoch": 6.527156549520766, + "grad_norm": 0.047379642724990845, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8172 + }, + { + "epoch": 6.527955271565495, + "grad_norm": 0.17822134494781494, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8173 + }, + { + "epoch": 6.5287539936102235, + "grad_norm": 0.07658754289150238, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8174 + }, + { + "epoch": 6.529552715654952, + "grad_norm": 0.17294292151927948, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8175 + }, + { + "epoch": 6.5303514376996805, + "grad_norm": 0.07095851004123688, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8176 + }, + { + "epoch": 6.531150159744409, + "grad_norm": 0.07328472286462784, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 8177 + }, + { + "epoch": 6.531948881789138, + "grad_norm": 0.11216691881418228, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8178 + }, + { + "epoch": 6.532747603833866, + "grad_norm": 0.3007374703884125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8179 + }, + { + "epoch": 6.533546325878595, + "grad_norm": 0.06059226021170616, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8180 + }, + { + "epoch": 6.534345047923322, + "grad_norm": 0.14438967406749725, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8181 + }, + { + "epoch": 6.535143769968051, + "grad_norm": 0.1965394914150238, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8182 + }, + { + "epoch": 6.535942492012779, + "grad_norm": 0.130478173494339, + "learning_rate": 0.0005, + "loss": 1.0479, + "step": 8183 + }, + { + "epoch": 6.536741214057508, + "grad_norm": 0.16713190078735352, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8184 + }, + { + "epoch": 6.537539936102236, + "grad_norm": 0.18644076585769653, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8185 + }, + { + "epoch": 6.538338658146965, + "grad_norm": 0.06685839593410492, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8186 + }, + { + "epoch": 6.539137380191693, + "grad_norm": 0.17819803953170776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8187 + }, + { + "epoch": 6.539936102236422, + "grad_norm": 0.5894746780395508, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8188 + }, + { + "epoch": 6.5407348242811505, + "grad_norm": 0.088719442486763, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8189 + }, + { + "epoch": 6.541533546325878, + "grad_norm": 0.1336045265197754, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8190 + }, + { + "epoch": 6.542332268370607, + "grad_norm": 0.12859520316123962, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8191 + }, + { + "epoch": 6.543130990415335, + "grad_norm": 0.13402487337589264, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8192 + }, + { + "epoch": 6.543929712460064, + "grad_norm": 0.11415290832519531, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8193 + }, + { + "epoch": 6.544728434504792, + "grad_norm": 0.1775715947151184, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8194 + }, + { + "epoch": 6.545527156549521, + "grad_norm": 0.6331294775009155, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8195 + }, + { + "epoch": 6.546325878594249, + "grad_norm": 0.09323445707559586, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8196 + }, + { + "epoch": 6.547124600638978, + "grad_norm": 0.1761421412229538, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8197 + }, + { + "epoch": 6.547923322683706, + "grad_norm": 0.09608824551105499, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8198 + }, + { + "epoch": 6.548722044728435, + "grad_norm": 0.07564207166433334, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8199 + }, + { + "epoch": 6.549520766773163, + "grad_norm": 0.08033318817615509, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8200 + }, + { + "epoch": 6.550319488817891, + "grad_norm": 0.13604776561260223, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8201 + }, + { + "epoch": 6.55111821086262, + "grad_norm": 0.1046299859881401, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8202 + }, + { + "epoch": 6.551916932907348, + "grad_norm": 0.23783712089061737, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8203 + }, + { + "epoch": 6.552715654952077, + "grad_norm": 0.07360750436782837, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8204 + }, + { + "epoch": 6.553514376996805, + "grad_norm": 0.07213526219129562, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8205 + }, + { + "epoch": 6.554313099041534, + "grad_norm": 0.12431066483259201, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8206 + }, + { + "epoch": 6.555111821086262, + "grad_norm": 0.09665104001760483, + "learning_rate": 0.0005, + "loss": 1.0432, + "step": 8207 + }, + { + "epoch": 6.555910543130991, + "grad_norm": 0.22090987861156464, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8208 + }, + { + "epoch": 6.556709265175719, + "grad_norm": 0.14936690032482147, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8209 + }, + { + "epoch": 6.557507987220447, + "grad_norm": 0.09804648160934448, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 8210 + }, + { + "epoch": 6.5583067092651754, + "grad_norm": 0.07829400897026062, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8211 + }, + { + "epoch": 6.559105431309904, + "grad_norm": 0.08218041807413101, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8212 + }, + { + "epoch": 6.5599041533546325, + "grad_norm": 0.08018422871828079, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8213 + }, + { + "epoch": 6.560702875399361, + "grad_norm": 0.07790627330541611, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8214 + }, + { + "epoch": 6.56150159744409, + "grad_norm": 0.12526501715183258, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 8215 + }, + { + "epoch": 6.562300319488818, + "grad_norm": 0.15222279727458954, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8216 + }, + { + "epoch": 6.563099041533547, + "grad_norm": 0.19605369865894318, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8217 + }, + { + "epoch": 6.563897763578275, + "grad_norm": 1.4426831007003784, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8218 + }, + { + "epoch": 6.564696485623003, + "grad_norm": 0.184299498796463, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8219 + }, + { + "epoch": 6.565495207667731, + "grad_norm": 0.12029392272233963, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8220 + }, + { + "epoch": 6.56629392971246, + "grad_norm": 0.07442726939916611, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8221 + }, + { + "epoch": 6.567092651757188, + "grad_norm": 0.14331156015396118, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8222 + }, + { + "epoch": 6.567891373801917, + "grad_norm": 0.11202000081539154, + "learning_rate": 0.0005, + "loss": 1.0429, + "step": 8223 + }, + { + "epoch": 6.568690095846645, + "grad_norm": 0.10699515789747238, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8224 + }, + { + "epoch": 6.569488817891374, + "grad_norm": 0.07708705961704254, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8225 + }, + { + "epoch": 6.5702875399361025, + "grad_norm": 0.08026644587516785, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8226 + }, + { + "epoch": 6.571086261980831, + "grad_norm": 0.08694002777338028, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8227 + }, + { + "epoch": 6.571884984025559, + "grad_norm": 0.11824248731136322, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8228 + }, + { + "epoch": 6.572683706070287, + "grad_norm": 0.06505008041858673, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8229 + }, + { + "epoch": 6.573482428115016, + "grad_norm": 0.05341152846813202, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8230 + }, + { + "epoch": 6.574281150159744, + "grad_norm": 0.09604120999574661, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8231 + }, + { + "epoch": 6.575079872204473, + "grad_norm": 0.08336330950260162, + "learning_rate": 0.0005, + "loss": 1.0485, + "step": 8232 + }, + { + "epoch": 6.575878594249201, + "grad_norm": 0.06368359923362732, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8233 + }, + { + "epoch": 6.57667731629393, + "grad_norm": 0.13115698099136353, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8234 + }, + { + "epoch": 6.577476038338658, + "grad_norm": 0.08847527951002121, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8235 + }, + { + "epoch": 6.578274760383387, + "grad_norm": 0.0458359532058239, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8236 + }, + { + "epoch": 6.5790734824281145, + "grad_norm": 0.10106709599494934, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8237 + }, + { + "epoch": 6.579872204472844, + "grad_norm": 0.06641486287117004, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8238 + }, + { + "epoch": 6.580670926517572, + "grad_norm": 0.0733480304479599, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8239 + }, + { + "epoch": 6.5814696485623, + "grad_norm": 0.07835566252470016, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8240 + }, + { + "epoch": 6.582268370607029, + "grad_norm": 0.13473013043403625, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8241 + }, + { + "epoch": 6.583067092651757, + "grad_norm": 0.062259674072265625, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8242 + }, + { + "epoch": 6.583865814696486, + "grad_norm": 0.05236242339015007, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8243 + }, + { + "epoch": 6.584664536741214, + "grad_norm": 0.08255355805158615, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 8244 + }, + { + "epoch": 6.585463258785943, + "grad_norm": 0.1182556301355362, + "learning_rate": 0.0005, + "loss": 1.0454, + "step": 8245 + }, + { + "epoch": 6.586261980830671, + "grad_norm": 0.0555981881916523, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8246 + }, + { + "epoch": 6.5870607028754, + "grad_norm": 0.09490877389907837, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8247 + }, + { + "epoch": 6.587859424920127, + "grad_norm": 0.6106880903244019, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8248 + }, + { + "epoch": 6.588658146964856, + "grad_norm": 0.0474761538207531, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8249 + }, + { + "epoch": 6.5894568690095845, + "grad_norm": 0.1429997831583023, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8250 + }, + { + "epoch": 6.590255591054313, + "grad_norm": 0.0815487951040268, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8251 + }, + { + "epoch": 6.5910543130990416, + "grad_norm": 0.096903957426548, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8252 + }, + { + "epoch": 6.59185303514377, + "grad_norm": 0.17775478959083557, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8253 + }, + { + "epoch": 6.592651757188499, + "grad_norm": 0.11637275665998459, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8254 + }, + { + "epoch": 6.593450479233227, + "grad_norm": 0.08475788682699203, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8255 + }, + { + "epoch": 6.594249201277956, + "grad_norm": 0.1786298304796219, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8256 + }, + { + "epoch": 6.595047923322683, + "grad_norm": 0.12316745519638062, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8257 + }, + { + "epoch": 6.595846645367412, + "grad_norm": 0.5367861986160278, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8258 + }, + { + "epoch": 6.59664536741214, + "grad_norm": 0.2289825677871704, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8259 + }, + { + "epoch": 6.597444089456869, + "grad_norm": 0.17333106696605682, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8260 + }, + { + "epoch": 6.598242811501597, + "grad_norm": 0.10858172923326492, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8261 + }, + { + "epoch": 6.599041533546326, + "grad_norm": 0.2013384997844696, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 8262 + }, + { + "epoch": 6.5998402555910545, + "grad_norm": 0.13658639788627625, + "learning_rate": 0.0005, + "loss": 1.045, + "step": 8263 + }, + { + "epoch": 6.600638977635783, + "grad_norm": 0.12755805253982544, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8264 + }, + { + "epoch": 6.6014376996805115, + "grad_norm": 0.18299050629138947, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8265 + }, + { + "epoch": 6.602236421725239, + "grad_norm": 0.07105828821659088, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8266 + }, + { + "epoch": 6.603035143769968, + "grad_norm": 0.13049830496311188, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8267 + }, + { + "epoch": 6.603833865814696, + "grad_norm": 0.16121532022953033, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8268 + }, + { + "epoch": 6.604632587859425, + "grad_norm": 0.07512015104293823, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8269 + }, + { + "epoch": 6.605431309904153, + "grad_norm": 0.17407254874706268, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8270 + }, + { + "epoch": 6.606230031948882, + "grad_norm": 0.11297854781150818, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8271 + }, + { + "epoch": 6.60702875399361, + "grad_norm": 0.2839175760746002, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8272 + }, + { + "epoch": 6.607827476038339, + "grad_norm": 0.07847599685192108, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8273 + }, + { + "epoch": 6.608626198083067, + "grad_norm": 0.08995212614536285, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8274 + }, + { + "epoch": 6.609424920127795, + "grad_norm": 0.07382770627737045, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8275 + }, + { + "epoch": 6.6102236421725244, + "grad_norm": 0.06170637533068657, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8276 + }, + { + "epoch": 6.611022364217252, + "grad_norm": 0.07311394810676575, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8277 + }, + { + "epoch": 6.611821086261981, + "grad_norm": 0.06827707588672638, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8278 + }, + { + "epoch": 6.612619808306709, + "grad_norm": 0.05261022970080376, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8279 + }, + { + "epoch": 6.613418530351438, + "grad_norm": 0.11326271295547485, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8280 + }, + { + "epoch": 6.614217252396166, + "grad_norm": 0.1652819961309433, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8281 + }, + { + "epoch": 6.615015974440895, + "grad_norm": 0.10749676078557968, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8282 + }, + { + "epoch": 6.615814696485623, + "grad_norm": 0.20359984040260315, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8283 + }, + { + "epoch": 6.616613418530352, + "grad_norm": 0.18771138787269592, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8284 + }, + { + "epoch": 6.61741214057508, + "grad_norm": 2.5382773876190186, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8285 + }, + { + "epoch": 6.618210862619808, + "grad_norm": 0.30566683411598206, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8286 + }, + { + "epoch": 6.6190095846645365, + "grad_norm": 0.3638366758823395, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8287 + }, + { + "epoch": 6.619808306709265, + "grad_norm": 0.10939022153615952, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8288 + }, + { + "epoch": 6.6206070287539935, + "grad_norm": 0.3243744969367981, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8289 + }, + { + "epoch": 6.621405750798722, + "grad_norm": 0.2703976333141327, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8290 + }, + { + "epoch": 6.622204472843451, + "grad_norm": 0.06998306512832642, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8291 + }, + { + "epoch": 6.623003194888179, + "grad_norm": 0.25409170985221863, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8292 + }, + { + "epoch": 6.623801916932908, + "grad_norm": 0.110246442258358, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8293 + }, + { + "epoch": 6.624600638977636, + "grad_norm": 0.1667647659778595, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8294 + }, + { + "epoch": 6.625399361022364, + "grad_norm": 0.17452718317508698, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8295 + }, + { + "epoch": 6.626198083067092, + "grad_norm": 0.11691702157258987, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 8296 + }, + { + "epoch": 6.626996805111821, + "grad_norm": 0.14679500460624695, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8297 + }, + { + "epoch": 6.627795527156549, + "grad_norm": 0.06978808343410492, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8298 + }, + { + "epoch": 6.628594249201278, + "grad_norm": 0.36758533120155334, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8299 + }, + { + "epoch": 6.6293929712460065, + "grad_norm": 0.11101481318473816, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8300 + }, + { + "epoch": 6.630191693290735, + "grad_norm": 0.11762239784002304, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8301 + }, + { + "epoch": 6.6309904153354635, + "grad_norm": 0.11467000097036362, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8302 + }, + { + "epoch": 6.631789137380192, + "grad_norm": 0.14236292243003845, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8303 + }, + { + "epoch": 6.63258785942492, + "grad_norm": 0.050860557705163956, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8304 + }, + { + "epoch": 6.633386581469648, + "grad_norm": 0.07763084024190903, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 8305 + }, + { + "epoch": 6.634185303514377, + "grad_norm": 0.06728993356227875, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8306 + }, + { + "epoch": 6.634984025559105, + "grad_norm": 0.06984454393386841, + "learning_rate": 0.0005, + "loss": 1.0478, + "step": 8307 + }, + { + "epoch": 6.635782747603834, + "grad_norm": 0.09839699417352676, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8308 + }, + { + "epoch": 6.636581469648562, + "grad_norm": 0.1262810379266739, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8309 + }, + { + "epoch": 6.637380191693291, + "grad_norm": 0.08147390931844711, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8310 + }, + { + "epoch": 6.638178913738019, + "grad_norm": 0.11567803472280502, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8311 + }, + { + "epoch": 6.638977635782748, + "grad_norm": 0.14972445368766785, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8312 + }, + { + "epoch": 6.6397763578274756, + "grad_norm": 0.2970331609249115, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8313 + }, + { + "epoch": 6.640575079872205, + "grad_norm": 0.05576174706220627, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8314 + }, + { + "epoch": 6.641373801916933, + "grad_norm": 0.048716023564338684, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8315 + }, + { + "epoch": 6.642172523961661, + "grad_norm": 0.05986058712005615, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8316 + }, + { + "epoch": 6.64297124600639, + "grad_norm": 0.07985493540763855, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8317 + }, + { + "epoch": 6.643769968051118, + "grad_norm": 0.5361261963844299, + "learning_rate": 0.0005, + "loss": 1.0462, + "step": 8318 + }, + { + "epoch": 6.644568690095847, + "grad_norm": 0.15383858978748322, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8319 + }, + { + "epoch": 6.645367412140575, + "grad_norm": 0.17428068816661835, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8320 + }, + { + "epoch": 6.646166134185304, + "grad_norm": 0.09801791608333588, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8321 + }, + { + "epoch": 6.646964856230032, + "grad_norm": 0.11805883049964905, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8322 + }, + { + "epoch": 6.647763578274761, + "grad_norm": 0.13135986030101776, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8323 + }, + { + "epoch": 6.6485623003194885, + "grad_norm": 0.10351908206939697, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8324 + }, + { + "epoch": 6.649361022364217, + "grad_norm": 0.11086217314004898, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8325 + }, + { + "epoch": 6.6501597444089455, + "grad_norm": 0.1173853799700737, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8326 + }, + { + "epoch": 6.650958466453674, + "grad_norm": 0.10743618756532669, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8327 + }, + { + "epoch": 6.651757188498403, + "grad_norm": 0.5378667116165161, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8328 + }, + { + "epoch": 6.652555910543131, + "grad_norm": 0.5077546834945679, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8329 + }, + { + "epoch": 6.65335463258786, + "grad_norm": 0.21998530626296997, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8330 + }, + { + "epoch": 6.654153354632588, + "grad_norm": 0.1235295757651329, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8331 + }, + { + "epoch": 6.654952076677317, + "grad_norm": 0.7328196167945862, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 8332 + }, + { + "epoch": 6.655750798722044, + "grad_norm": 0.12249958515167236, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8333 + }, + { + "epoch": 6.656549520766773, + "grad_norm": 0.12837325036525726, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8334 + }, + { + "epoch": 6.657348242811501, + "grad_norm": 0.09456688165664673, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8335 + }, + { + "epoch": 6.65814696485623, + "grad_norm": 0.13044698536396027, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8336 + }, + { + "epoch": 6.6589456869009584, + "grad_norm": 0.13105876743793488, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8337 + }, + { + "epoch": 6.659744408945687, + "grad_norm": 0.14498500525951385, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8338 + }, + { + "epoch": 6.6605431309904155, + "grad_norm": 0.08840721845626831, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8339 + }, + { + "epoch": 6.661341853035144, + "grad_norm": 1.276719570159912, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8340 + }, + { + "epoch": 6.662140575079873, + "grad_norm": 0.36189836263656616, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 8341 + }, + { + "epoch": 6.6629392971246, + "grad_norm": 0.6304068565368652, + "learning_rate": 0.0005, + "loss": 1.0792, + "step": 8342 + }, + { + "epoch": 6.663738019169329, + "grad_norm": 0.524870753288269, + "learning_rate": 0.0005, + "loss": 1.0797, + "step": 8343 + }, + { + "epoch": 6.664536741214057, + "grad_norm": 0.14638005197048187, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8344 + }, + { + "epoch": 6.665335463258786, + "grad_norm": 0.3090416491031647, + "learning_rate": 0.0005, + "loss": 1.0736, + "step": 8345 + }, + { + "epoch": 6.666134185303514, + "grad_norm": 0.1549086570739746, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8346 + }, + { + "epoch": 6.666932907348243, + "grad_norm": 0.36996960639953613, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8347 + }, + { + "epoch": 6.667731629392971, + "grad_norm": 0.4879205524921417, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 8348 + }, + { + "epoch": 6.6685303514377, + "grad_norm": 0.6129382848739624, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8349 + }, + { + "epoch": 6.669329073482428, + "grad_norm": 0.37913191318511963, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8350 + }, + { + "epoch": 6.670127795527156, + "grad_norm": 0.1678311973810196, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 8351 + }, + { + "epoch": 6.6709265175718855, + "grad_norm": 0.17131182551383972, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8352 + }, + { + "epoch": 6.671725239616613, + "grad_norm": 0.29875028133392334, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8353 + }, + { + "epoch": 6.672523961661342, + "grad_norm": 0.5288842916488647, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8354 + }, + { + "epoch": 6.67332268370607, + "grad_norm": 0.24637238681316376, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8355 + }, + { + "epoch": 6.674121405750799, + "grad_norm": 0.25089535117149353, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8356 + }, + { + "epoch": 6.674920127795527, + "grad_norm": 0.5517246723175049, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8357 + }, + { + "epoch": 6.675718849840256, + "grad_norm": 0.07291965931653976, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8358 + }, + { + "epoch": 6.676517571884984, + "grad_norm": 0.2561021149158478, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8359 + }, + { + "epoch": 6.677316293929713, + "grad_norm": 0.2184453308582306, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8360 + }, + { + "epoch": 6.678115015974441, + "grad_norm": 0.10715393722057343, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8361 + }, + { + "epoch": 6.678913738019169, + "grad_norm": 0.16824330389499664, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8362 + }, + { + "epoch": 6.6797124600638975, + "grad_norm": 0.22539092600345612, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8363 + }, + { + "epoch": 6.680511182108626, + "grad_norm": 0.11956257373094559, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8364 + }, + { + "epoch": 6.681309904153355, + "grad_norm": 0.2023434042930603, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8365 + }, + { + "epoch": 6.682108626198083, + "grad_norm": 0.26878416538238525, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8366 + }, + { + "epoch": 6.682907348242812, + "grad_norm": 0.11318770796060562, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8367 + }, + { + "epoch": 6.68370607028754, + "grad_norm": 0.29282090067863464, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 8368 + }, + { + "epoch": 6.684504792332269, + "grad_norm": 0.23825445771217346, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8369 + }, + { + "epoch": 6.685303514376997, + "grad_norm": 0.27186012268066406, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 8370 + }, + { + "epoch": 6.686102236421725, + "grad_norm": 0.28540825843811035, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8371 + }, + { + "epoch": 6.686900958466453, + "grad_norm": 0.14273707568645477, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8372 + }, + { + "epoch": 6.687699680511182, + "grad_norm": 0.3684747815132141, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8373 + }, + { + "epoch": 6.68849840255591, + "grad_norm": 0.23812046647071838, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8374 + }, + { + "epoch": 6.689297124600639, + "grad_norm": 0.15459395945072174, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8375 + }, + { + "epoch": 6.6900958466453675, + "grad_norm": 0.28762584924697876, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8376 + }, + { + "epoch": 6.690894568690096, + "grad_norm": 0.16686615347862244, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8377 + }, + { + "epoch": 6.6916932907348246, + "grad_norm": 0.16456246376037598, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8378 + }, + { + "epoch": 6.692492012779553, + "grad_norm": 0.2991560399532318, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8379 + }, + { + "epoch": 6.693290734824281, + "grad_norm": 0.14811092615127563, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8380 + }, + { + "epoch": 6.694089456869009, + "grad_norm": 0.14380809664726257, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8381 + }, + { + "epoch": 6.694888178913738, + "grad_norm": 0.0801207646727562, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8382 + }, + { + "epoch": 6.695686900958466, + "grad_norm": 0.08404620736837387, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8383 + }, + { + "epoch": 6.696485623003195, + "grad_norm": 0.1137305274605751, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8384 + }, + { + "epoch": 6.697284345047923, + "grad_norm": 0.08207721263170242, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8385 + }, + { + "epoch": 6.698083067092652, + "grad_norm": 0.09234748780727386, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8386 + }, + { + "epoch": 6.69888178913738, + "grad_norm": 0.29589149355888367, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8387 + }, + { + "epoch": 6.699680511182109, + "grad_norm": 0.2142077386379242, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8388 + }, + { + "epoch": 6.700479233226837, + "grad_norm": 0.10343299061059952, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8389 + }, + { + "epoch": 6.701277955271565, + "grad_norm": 0.12988241016864777, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8390 + }, + { + "epoch": 6.702076677316294, + "grad_norm": 0.20497195422649384, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 8391 + }, + { + "epoch": 6.702875399361022, + "grad_norm": 0.10697030276060104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8392 + }, + { + "epoch": 6.703674121405751, + "grad_norm": 0.1844921112060547, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8393 + }, + { + "epoch": 6.704472843450479, + "grad_norm": 0.13283176720142365, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8394 + }, + { + "epoch": 6.705271565495208, + "grad_norm": 0.14544987678527832, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8395 + }, + { + "epoch": 6.706070287539936, + "grad_norm": 0.10253588855266571, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8396 + }, + { + "epoch": 6.706869009584665, + "grad_norm": 0.11183217167854309, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8397 + }, + { + "epoch": 6.707667731629393, + "grad_norm": 0.12705212831497192, + "learning_rate": 0.0005, + "loss": 1.0643, + "step": 8398 + }, + { + "epoch": 6.708466453674122, + "grad_norm": 0.08835884928703308, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8399 + }, + { + "epoch": 6.7092651757188495, + "grad_norm": 0.22377537190914154, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8400 + }, + { + "epoch": 6.710063897763578, + "grad_norm": 0.7205986976623535, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8401 + }, + { + "epoch": 6.710862619808307, + "grad_norm": 0.07383892685174942, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8402 + }, + { + "epoch": 6.711661341853035, + "grad_norm": 0.11109078675508499, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8403 + }, + { + "epoch": 6.712460063897764, + "grad_norm": 0.10979527235031128, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8404 + }, + { + "epoch": 6.713258785942492, + "grad_norm": 0.062491416931152344, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8405 + }, + { + "epoch": 6.714057507987221, + "grad_norm": 0.11196211725473404, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8406 + }, + { + "epoch": 6.714856230031949, + "grad_norm": 0.07815852016210556, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8407 + }, + { + "epoch": 6.715654952076678, + "grad_norm": 3.9684712886810303, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8408 + }, + { + "epoch": 6.716453674121405, + "grad_norm": 0.11982189118862152, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8409 + }, + { + "epoch": 6.717252396166134, + "grad_norm": 0.22319400310516357, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8410 + }, + { + "epoch": 6.718051118210862, + "grad_norm": 0.0937948003411293, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8411 + }, + { + "epoch": 6.718849840255591, + "grad_norm": 0.09193865954875946, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8412 + }, + { + "epoch": 6.7196485623003195, + "grad_norm": 0.08838166296482086, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8413 + }, + { + "epoch": 6.720447284345048, + "grad_norm": 0.0960271805524826, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8414 + }, + { + "epoch": 6.7212460063897765, + "grad_norm": 0.07488188147544861, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8415 + }, + { + "epoch": 6.722044728434505, + "grad_norm": 0.08563253283500671, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 8416 + }, + { + "epoch": 6.722843450479234, + "grad_norm": 0.16766750812530518, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8417 + }, + { + "epoch": 6.723642172523961, + "grad_norm": 0.12811559438705444, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8418 + }, + { + "epoch": 6.72444089456869, + "grad_norm": 0.12410838901996613, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8419 + }, + { + "epoch": 6.725239616613418, + "grad_norm": 0.1354755014181137, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8420 + }, + { + "epoch": 6.726038338658147, + "grad_norm": 0.17771920561790466, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8421 + }, + { + "epoch": 6.726837060702875, + "grad_norm": 0.19576571881771088, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8422 + }, + { + "epoch": 6.727635782747604, + "grad_norm": 0.5415527820587158, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 8423 + }, + { + "epoch": 6.728434504792332, + "grad_norm": 0.6647717952728271, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8424 + }, + { + "epoch": 6.729233226837061, + "grad_norm": 0.16329380869865417, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8425 + }, + { + "epoch": 6.7300319488817895, + "grad_norm": 0.4046335518360138, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8426 + }, + { + "epoch": 6.730830670926517, + "grad_norm": 0.1817079335451126, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8427 + }, + { + "epoch": 6.731629392971246, + "grad_norm": 0.3438379466533661, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 8428 + }, + { + "epoch": 6.732428115015974, + "grad_norm": 0.48276495933532715, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8429 + }, + { + "epoch": 6.733226837060703, + "grad_norm": 0.4002913236618042, + "learning_rate": 0.0005, + "loss": 1.0731, + "step": 8430 + }, + { + "epoch": 6.734025559105431, + "grad_norm": 0.37833303213119507, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8431 + }, + { + "epoch": 6.73482428115016, + "grad_norm": 0.26374873518943787, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8432 + }, + { + "epoch": 6.735623003194888, + "grad_norm": 0.19766554236412048, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8433 + }, + { + "epoch": 6.736421725239617, + "grad_norm": 0.1996731013059616, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8434 + }, + { + "epoch": 6.737220447284345, + "grad_norm": 0.19733403623104095, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8435 + }, + { + "epoch": 6.738019169329074, + "grad_norm": 0.24423246085643768, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8436 + }, + { + "epoch": 6.738817891373802, + "grad_norm": 0.4329655170440674, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8437 + }, + { + "epoch": 6.73961661341853, + "grad_norm": 0.6964716911315918, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8438 + }, + { + "epoch": 6.7404153354632586, + "grad_norm": 0.12961135804653168, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8439 + }, + { + "epoch": 6.741214057507987, + "grad_norm": 0.2783071994781494, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 8440 + }, + { + "epoch": 6.742012779552716, + "grad_norm": 0.3446369767189026, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8441 + }, + { + "epoch": 6.742811501597444, + "grad_norm": 0.22592051327228546, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8442 + }, + { + "epoch": 6.743610223642173, + "grad_norm": 0.06710102409124374, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8443 + }, + { + "epoch": 6.744408945686901, + "grad_norm": 0.2268608957529068, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8444 + }, + { + "epoch": 6.74520766773163, + "grad_norm": 0.08200005441904068, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8445 + }, + { + "epoch": 6.746006389776358, + "grad_norm": 0.2357168197631836, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8446 + }, + { + "epoch": 6.746805111821086, + "grad_norm": 0.20047837495803833, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8447 + }, + { + "epoch": 6.747603833865814, + "grad_norm": 0.2309340387582779, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8448 + }, + { + "epoch": 6.748402555910543, + "grad_norm": 0.11635745316743851, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8449 + }, + { + "epoch": 6.7492012779552715, + "grad_norm": 0.4076550602912903, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8450 + }, + { + "epoch": 6.75, + "grad_norm": 0.3500226140022278, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8451 + }, + { + "epoch": 6.7507987220447285, + "grad_norm": 0.2993873357772827, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8452 + }, + { + "epoch": 6.751597444089457, + "grad_norm": 0.1099642813205719, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8453 + }, + { + "epoch": 6.752396166134186, + "grad_norm": 0.17455045878887177, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 8454 + }, + { + "epoch": 6.753194888178914, + "grad_norm": 0.12831585109233856, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8455 + }, + { + "epoch": 6.753993610223642, + "grad_norm": 0.1048964336514473, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 8456 + }, + { + "epoch": 6.75479233226837, + "grad_norm": 0.16713464260101318, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 8457 + }, + { + "epoch": 6.755591054313099, + "grad_norm": 0.07837880402803421, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8458 + }, + { + "epoch": 6.756389776357827, + "grad_norm": 0.17375724017620087, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 8459 + }, + { + "epoch": 6.757188498402556, + "grad_norm": 0.9700595140457153, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8460 + }, + { + "epoch": 6.757987220447284, + "grad_norm": 0.23614056408405304, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8461 + }, + { + "epoch": 6.758785942492013, + "grad_norm": 0.2536165416240692, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 8462 + }, + { + "epoch": 6.7595846645367414, + "grad_norm": 0.26688873767852783, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 8463 + }, + { + "epoch": 6.76038338658147, + "grad_norm": 0.3807159662246704, + "learning_rate": 0.0005, + "loss": 1.0671, + "step": 8464 + }, + { + "epoch": 6.761182108626198, + "grad_norm": 0.2132156789302826, + "learning_rate": 0.0005, + "loss": 1.0703, + "step": 8465 + }, + { + "epoch": 6.761980830670926, + "grad_norm": 0.19821512699127197, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 8466 + }, + { + "epoch": 6.762779552715655, + "grad_norm": 0.23694948852062225, + "learning_rate": 0.0005, + "loss": 1.0805, + "step": 8467 + }, + { + "epoch": 6.763578274760383, + "grad_norm": 0.1524704396724701, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 8468 + }, + { + "epoch": 6.764376996805112, + "grad_norm": 0.26719930768013, + "learning_rate": 0.0005, + "loss": 1.0683, + "step": 8469 + }, + { + "epoch": 6.76517571884984, + "grad_norm": 0.12077363580465317, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8470 + }, + { + "epoch": 6.765974440894569, + "grad_norm": 0.14398355782032013, + "learning_rate": 0.0005, + "loss": 1.0667, + "step": 8471 + }, + { + "epoch": 6.766773162939297, + "grad_norm": 0.1972649097442627, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8472 + }, + { + "epoch": 6.767571884984026, + "grad_norm": 0.10172676295042038, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8473 + }, + { + "epoch": 6.768370607028754, + "grad_norm": 0.10743385553359985, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8474 + }, + { + "epoch": 6.769169329073483, + "grad_norm": 0.06148320063948631, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8475 + }, + { + "epoch": 6.7699680511182105, + "grad_norm": 0.08771604299545288, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8476 + }, + { + "epoch": 6.770766773162939, + "grad_norm": 0.13444122672080994, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8477 + }, + { + "epoch": 6.771565495207668, + "grad_norm": 0.4677158296108246, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8478 + }, + { + "epoch": 6.772364217252396, + "grad_norm": 0.08972432464361191, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8479 + }, + { + "epoch": 6.773162939297125, + "grad_norm": 0.10502214729785919, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8480 + }, + { + "epoch": 6.773961661341853, + "grad_norm": 0.14014923572540283, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8481 + }, + { + "epoch": 6.774760383386582, + "grad_norm": 0.3244888484477997, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8482 + }, + { + "epoch": 6.77555910543131, + "grad_norm": 0.20495742559432983, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8483 + }, + { + "epoch": 6.776357827476039, + "grad_norm": 0.15609663724899292, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8484 + }, + { + "epoch": 6.777156549520766, + "grad_norm": 0.13948239386081696, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8485 + }, + { + "epoch": 6.777955271565495, + "grad_norm": 0.28558677434921265, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8486 + }, + { + "epoch": 6.7787539936102235, + "grad_norm": 0.1481117457151413, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8487 + }, + { + "epoch": 6.779552715654952, + "grad_norm": 0.31998512148857117, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8488 + }, + { + "epoch": 6.7803514376996805, + "grad_norm": 0.1945921927690506, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 8489 + }, + { + "epoch": 6.781150159744409, + "grad_norm": 18.217361450195312, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8490 + }, + { + "epoch": 6.781948881789138, + "grad_norm": 0.23472756147384644, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 8491 + }, + { + "epoch": 6.782747603833866, + "grad_norm": 0.10026291757822037, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8492 + }, + { + "epoch": 6.783546325878595, + "grad_norm": 0.14418581128120422, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8493 + }, + { + "epoch": 6.784345047923322, + "grad_norm": 0.14439892768859863, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8494 + }, + { + "epoch": 6.785143769968051, + "grad_norm": 0.37140071392059326, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8495 + }, + { + "epoch": 6.785942492012779, + "grad_norm": 0.09995266050100327, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8496 + }, + { + "epoch": 6.786741214057508, + "grad_norm": 0.08430355042219162, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8497 + }, + { + "epoch": 6.787539936102236, + "grad_norm": 0.11121980845928192, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8498 + }, + { + "epoch": 6.788338658146965, + "grad_norm": 0.20520392060279846, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 8499 + }, + { + "epoch": 6.789137380191693, + "grad_norm": 0.10163573920726776, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8500 + }, + { + "epoch": 6.789936102236422, + "grad_norm": 0.12025435268878937, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8501 + }, + { + "epoch": 6.7907348242811505, + "grad_norm": 0.12003593891859055, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8502 + }, + { + "epoch": 6.791533546325878, + "grad_norm": 0.11013154685497284, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8503 + }, + { + "epoch": 6.792332268370607, + "grad_norm": 0.10089465230703354, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8504 + }, + { + "epoch": 6.793130990415335, + "grad_norm": 0.06270314007997513, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8505 + }, + { + "epoch": 6.793929712460064, + "grad_norm": 0.08571597188711166, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8506 + }, + { + "epoch": 6.794728434504792, + "grad_norm": 0.5324975848197937, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8507 + }, + { + "epoch": 6.795527156549521, + "grad_norm": 0.24500170350074768, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8508 + }, + { + "epoch": 6.796325878594249, + "grad_norm": 0.10234003514051437, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8509 + }, + { + "epoch": 6.797124600638978, + "grad_norm": 0.09924131631851196, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8510 + }, + { + "epoch": 6.797923322683706, + "grad_norm": 0.1413181573152542, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8511 + }, + { + "epoch": 6.798722044728435, + "grad_norm": 0.12095441669225693, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8512 + }, + { + "epoch": 6.799520766773163, + "grad_norm": 0.08617071062326431, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8513 + }, + { + "epoch": 6.800319488817891, + "grad_norm": 0.17984576523303986, + "learning_rate": 0.0005, + "loss": 1.0673, + "step": 8514 + }, + { + "epoch": 6.80111821086262, + "grad_norm": 0.16447608172893524, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8515 + }, + { + "epoch": 6.801916932907348, + "grad_norm": 0.15486668050289154, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8516 + }, + { + "epoch": 6.802715654952077, + "grad_norm": 0.10176295787096024, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8517 + }, + { + "epoch": 6.803514376996805, + "grad_norm": 0.14911721646785736, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8518 + }, + { + "epoch": 6.804313099041534, + "grad_norm": 0.11073625087738037, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8519 + }, + { + "epoch": 6.805111821086262, + "grad_norm": 0.10299605876207352, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8520 + }, + { + "epoch": 6.805910543130991, + "grad_norm": 0.189669668674469, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8521 + }, + { + "epoch": 6.806709265175719, + "grad_norm": 0.12226799875497818, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8522 + }, + { + "epoch": 6.807507987220447, + "grad_norm": 0.17778469622135162, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8523 + }, + { + "epoch": 6.8083067092651754, + "grad_norm": 0.16370487213134766, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8524 + }, + { + "epoch": 6.809105431309904, + "grad_norm": 0.05171172693371773, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8525 + }, + { + "epoch": 6.8099041533546325, + "grad_norm": 0.16393537819385529, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8526 + }, + { + "epoch": 6.810702875399361, + "grad_norm": 0.09398743510246277, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8527 + }, + { + "epoch": 6.81150159744409, + "grad_norm": 0.08430743217468262, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8528 + }, + { + "epoch": 6.812300319488818, + "grad_norm": 0.1131691113114357, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 8529 + }, + { + "epoch": 6.813099041533547, + "grad_norm": 0.0907130092382431, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8530 + }, + { + "epoch": 6.813897763578275, + "grad_norm": 0.1460096687078476, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8531 + }, + { + "epoch": 6.814696485623003, + "grad_norm": 0.07953288406133652, + "learning_rate": 0.0005, + "loss": 1.0472, + "step": 8532 + }, + { + "epoch": 6.815495207667731, + "grad_norm": 0.061827294528484344, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 8533 + }, + { + "epoch": 6.81629392971246, + "grad_norm": 0.09172365814447403, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8534 + }, + { + "epoch": 6.817092651757188, + "grad_norm": 0.05858466029167175, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8535 + }, + { + "epoch": 6.817891373801917, + "grad_norm": 0.13774308562278748, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8536 + }, + { + "epoch": 6.818690095846645, + "grad_norm": 0.09840130060911179, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8537 + }, + { + "epoch": 6.819488817891374, + "grad_norm": 0.06836584210395813, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8538 + }, + { + "epoch": 6.8202875399361025, + "grad_norm": 0.15930971503257751, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8539 + }, + { + "epoch": 6.821086261980831, + "grad_norm": 0.12306738644838333, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8540 + }, + { + "epoch": 6.821884984025559, + "grad_norm": 0.09868071228265762, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 8541 + }, + { + "epoch": 6.822683706070287, + "grad_norm": 0.09411876648664474, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8542 + }, + { + "epoch": 6.823482428115016, + "grad_norm": 0.09062112122774124, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8543 + }, + { + "epoch": 6.824281150159744, + "grad_norm": 0.14964330196380615, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8544 + }, + { + "epoch": 6.825079872204473, + "grad_norm": 0.1444161832332611, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8545 + }, + { + "epoch": 6.825878594249201, + "grad_norm": 0.15247556567192078, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8546 + }, + { + "epoch": 6.82667731629393, + "grad_norm": 0.1556181013584137, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 8547 + }, + { + "epoch": 6.827476038338658, + "grad_norm": 0.1781637817621231, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8548 + }, + { + "epoch": 6.828274760383387, + "grad_norm": 0.10066398978233337, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8549 + }, + { + "epoch": 6.8290734824281145, + "grad_norm": 3.0298452377319336, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8550 + }, + { + "epoch": 6.829872204472844, + "grad_norm": 0.2745296061038971, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8551 + }, + { + "epoch": 6.830670926517572, + "grad_norm": 0.4030947983264923, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 8552 + }, + { + "epoch": 6.8314696485623, + "grad_norm": 0.11019638180732727, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8553 + }, + { + "epoch": 6.832268370607029, + "grad_norm": 0.33687886595726013, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8554 + }, + { + "epoch": 6.833067092651757, + "grad_norm": 0.164499431848526, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8555 + }, + { + "epoch": 6.833865814696486, + "grad_norm": 0.31624776124954224, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8556 + }, + { + "epoch": 6.834664536741214, + "grad_norm": 0.24264110624790192, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8557 + }, + { + "epoch": 6.835463258785943, + "grad_norm": 0.19310493767261505, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8558 + }, + { + "epoch": 6.836261980830671, + "grad_norm": 0.2903575003147125, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 8559 + }, + { + "epoch": 6.8370607028754, + "grad_norm": 0.22584185004234314, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8560 + }, + { + "epoch": 6.837859424920127, + "grad_norm": 0.2400067150592804, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8561 + }, + { + "epoch": 6.838658146964856, + "grad_norm": 0.22543750703334808, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8562 + }, + { + "epoch": 6.8394568690095845, + "grad_norm": 0.2071310430765152, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8563 + }, + { + "epoch": 6.840255591054313, + "grad_norm": 0.07198980450630188, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8564 + }, + { + "epoch": 6.8410543130990416, + "grad_norm": 0.14733794331550598, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8565 + }, + { + "epoch": 6.84185303514377, + "grad_norm": 0.10259919613599777, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8566 + }, + { + "epoch": 6.842651757188499, + "grad_norm": 0.11961761116981506, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8567 + }, + { + "epoch": 6.843450479233227, + "grad_norm": 0.2714863121509552, + "learning_rate": 0.0005, + "loss": 1.0484, + "step": 8568 + }, + { + "epoch": 6.844249201277956, + "grad_norm": 0.23675218224525452, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8569 + }, + { + "epoch": 6.845047923322683, + "grad_norm": 0.17738480865955353, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8570 + }, + { + "epoch": 6.845846645367412, + "grad_norm": 0.2558303475379944, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8571 + }, + { + "epoch": 6.84664536741214, + "grad_norm": 0.19869430363178253, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8572 + }, + { + "epoch": 6.847444089456869, + "grad_norm": 0.15806829929351807, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8573 + }, + { + "epoch": 6.848242811501597, + "grad_norm": 0.12016306072473526, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8574 + }, + { + "epoch": 6.849041533546326, + "grad_norm": 0.10831576585769653, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8575 + }, + { + "epoch": 6.8498402555910545, + "grad_norm": 0.06762730330228806, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8576 + }, + { + "epoch": 6.850638977635783, + "grad_norm": 0.0824534222483635, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8577 + }, + { + "epoch": 6.8514376996805115, + "grad_norm": 0.20734307169914246, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8578 + }, + { + "epoch": 6.852236421725239, + "grad_norm": 0.22174668312072754, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8579 + }, + { + "epoch": 6.853035143769968, + "grad_norm": 0.05667027458548546, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8580 + }, + { + "epoch": 6.853833865814696, + "grad_norm": 0.2844708561897278, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8581 + }, + { + "epoch": 6.854632587859425, + "grad_norm": 0.21092848479747772, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 8582 + }, + { + "epoch": 6.855431309904153, + "grad_norm": 0.08843044936656952, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8583 + }, + { + "epoch": 6.856230031948882, + "grad_norm": 0.08862966299057007, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8584 + }, + { + "epoch": 6.85702875399361, + "grad_norm": 0.13263291120529175, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8585 + }, + { + "epoch": 6.857827476038339, + "grad_norm": 0.1969175636768341, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8586 + }, + { + "epoch": 6.858626198083067, + "grad_norm": 0.1299106925725937, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8587 + }, + { + "epoch": 6.859424920127795, + "grad_norm": 0.058154329657554626, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8588 + }, + { + "epoch": 6.8602236421725244, + "grad_norm": 0.06485166400671005, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8589 + }, + { + "epoch": 6.861022364217252, + "grad_norm": 6.880006313323975, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8590 + }, + { + "epoch": 6.861821086261981, + "grad_norm": 0.09929946064949036, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8591 + }, + { + "epoch": 6.862619808306709, + "grad_norm": 0.11197477579116821, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8592 + }, + { + "epoch": 6.863418530351438, + "grad_norm": 0.06740657985210419, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8593 + }, + { + "epoch": 6.864217252396166, + "grad_norm": 0.19594676792621613, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8594 + }, + { + "epoch": 6.865015974440895, + "grad_norm": 0.16844215989112854, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8595 + }, + { + "epoch": 6.865814696485623, + "grad_norm": 0.08980540931224823, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8596 + }, + { + "epoch": 6.866613418530352, + "grad_norm": 0.1263660043478012, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8597 + }, + { + "epoch": 6.86741214057508, + "grad_norm": 0.2000604271888733, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8598 + }, + { + "epoch": 6.868210862619808, + "grad_norm": 0.08987699449062347, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8599 + }, + { + "epoch": 6.8690095846645365, + "grad_norm": 0.12263453006744385, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8600 + }, + { + "epoch": 6.869808306709265, + "grad_norm": 0.1567721962928772, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8601 + }, + { + "epoch": 6.8706070287539935, + "grad_norm": 0.08756576478481293, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8602 + }, + { + "epoch": 6.871405750798722, + "grad_norm": 0.11816724389791489, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8603 + }, + { + "epoch": 6.872204472843451, + "grad_norm": 0.13798843324184418, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8604 + }, + { + "epoch": 6.873003194888179, + "grad_norm": 0.12364917248487473, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8605 + }, + { + "epoch": 6.873801916932908, + "grad_norm": 0.1200469508767128, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8606 + }, + { + "epoch": 6.874600638977636, + "grad_norm": 0.12144476920366287, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8607 + }, + { + "epoch": 6.875399361022364, + "grad_norm": 0.20083829760551453, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8608 + }, + { + "epoch": 6.876198083067092, + "grad_norm": 0.2817170023918152, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8609 + }, + { + "epoch": 6.876996805111821, + "grad_norm": 0.12137018889188766, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 8610 + }, + { + "epoch": 6.877795527156549, + "grad_norm": 0.09903489053249359, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8611 + }, + { + "epoch": 6.878594249201278, + "grad_norm": 0.17958515882492065, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8612 + }, + { + "epoch": 6.8793929712460065, + "grad_norm": 0.1041099801659584, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8613 + }, + { + "epoch": 6.880191693290735, + "grad_norm": 0.16099892556667328, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8614 + }, + { + "epoch": 6.8809904153354635, + "grad_norm": 0.061900194734334946, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8615 + }, + { + "epoch": 6.881789137380192, + "grad_norm": 0.1341199427843094, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8616 + }, + { + "epoch": 6.88258785942492, + "grad_norm": 0.12683184444904327, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8617 + }, + { + "epoch": 6.883386581469648, + "grad_norm": 0.08566799014806747, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8618 + }, + { + "epoch": 6.884185303514377, + "grad_norm": 0.1616903841495514, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 8619 + }, + { + "epoch": 6.884984025559105, + "grad_norm": 0.05832672119140625, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8620 + }, + { + "epoch": 6.885782747603834, + "grad_norm": 0.15186071395874023, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8621 + }, + { + "epoch": 6.886581469648562, + "grad_norm": 0.16585935652256012, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8622 + }, + { + "epoch": 6.887380191693291, + "grad_norm": 0.1267954260110855, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8623 + }, + { + "epoch": 6.888178913738019, + "grad_norm": 0.22396692633628845, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 8624 + }, + { + "epoch": 6.888977635782748, + "grad_norm": 0.133334219455719, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8625 + }, + { + "epoch": 6.8897763578274756, + "grad_norm": 0.1935819834470749, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8626 + }, + { + "epoch": 6.890575079872205, + "grad_norm": 0.32829585671424866, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8627 + }, + { + "epoch": 6.891373801916933, + "grad_norm": 0.231554314494133, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8628 + }, + { + "epoch": 6.892172523961661, + "grad_norm": 0.20693574845790863, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8629 + }, + { + "epoch": 6.89297124600639, + "grad_norm": 0.21037861704826355, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8630 + }, + { + "epoch": 6.893769968051118, + "grad_norm": 0.051133595407009125, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8631 + }, + { + "epoch": 6.894568690095847, + "grad_norm": 0.17635062336921692, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8632 + }, + { + "epoch": 6.895367412140575, + "grad_norm": 0.14592808485031128, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8633 + }, + { + "epoch": 6.896166134185304, + "grad_norm": 0.15353697538375854, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8634 + }, + { + "epoch": 6.896964856230032, + "grad_norm": 0.19556251168251038, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8635 + }, + { + "epoch": 6.897763578274761, + "grad_norm": 0.06867649406194687, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8636 + }, + { + "epoch": 6.8985623003194885, + "grad_norm": 0.15286169946193695, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8637 + }, + { + "epoch": 6.899361022364217, + "grad_norm": 0.28361746668815613, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8638 + }, + { + "epoch": 6.9001597444089455, + "grad_norm": 0.09351217746734619, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8639 + }, + { + "epoch": 6.900958466453674, + "grad_norm": 0.11050279438495636, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8640 + }, + { + "epoch": 6.901757188498403, + "grad_norm": 0.1648218333721161, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8641 + }, + { + "epoch": 6.902555910543131, + "grad_norm": 0.10323848575353622, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8642 + }, + { + "epoch": 6.90335463258786, + "grad_norm": 0.14925505220890045, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8643 + }, + { + "epoch": 6.904153354632588, + "grad_norm": 0.05877414718270302, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8644 + }, + { + "epoch": 6.904952076677317, + "grad_norm": 0.3324354290962219, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8645 + }, + { + "epoch": 6.905750798722044, + "grad_norm": 0.22756889462471008, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8646 + }, + { + "epoch": 6.906549520766773, + "grad_norm": 0.1040947288274765, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 8647 + }, + { + "epoch": 6.907348242811501, + "grad_norm": 0.1310190111398697, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8648 + }, + { + "epoch": 6.90814696485623, + "grad_norm": 0.09484609216451645, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8649 + }, + { + "epoch": 6.9089456869009584, + "grad_norm": 0.13337384164333344, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8650 + }, + { + "epoch": 6.909744408945687, + "grad_norm": 0.31157273054122925, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8651 + }, + { + "epoch": 6.9105431309904155, + "grad_norm": 0.15081669390201569, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8652 + }, + { + "epoch": 6.911341853035144, + "grad_norm": 0.14120221138000488, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8653 + }, + { + "epoch": 6.912140575079873, + "grad_norm": 0.6128084659576416, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8654 + }, + { + "epoch": 6.9129392971246, + "grad_norm": 0.6915252208709717, + "learning_rate": 0.0005, + "loss": 1.0877, + "step": 8655 + }, + { + "epoch": 6.913738019169329, + "grad_norm": 0.7245156168937683, + "learning_rate": 0.0005, + "loss": 1.0852, + "step": 8656 + }, + { + "epoch": 6.914536741214057, + "grad_norm": 0.8400923013687134, + "learning_rate": 0.0005, + "loss": 1.1021, + "step": 8657 + }, + { + "epoch": 6.915335463258786, + "grad_norm": 0.3218044340610504, + "learning_rate": 0.0005, + "loss": 1.1046, + "step": 8658 + }, + { + "epoch": 6.916134185303514, + "grad_norm": 0.3119533061981201, + "learning_rate": 0.0005, + "loss": 1.1011, + "step": 8659 + }, + { + "epoch": 6.916932907348243, + "grad_norm": 0.2192138433456421, + "learning_rate": 0.0005, + "loss": 1.0871, + "step": 8660 + }, + { + "epoch": 6.917731629392971, + "grad_norm": 0.36212611198425293, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 8661 + }, + { + "epoch": 6.9185303514377, + "grad_norm": 0.13674713671207428, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8662 + }, + { + "epoch": 6.919329073482428, + "grad_norm": 0.24960070848464966, + "learning_rate": 0.0005, + "loss": 1.0823, + "step": 8663 + }, + { + "epoch": 6.920127795527156, + "grad_norm": 0.16797062754631042, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8664 + }, + { + "epoch": 6.9209265175718855, + "grad_norm": 0.23811157047748566, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 8665 + }, + { + "epoch": 6.921725239616613, + "grad_norm": 0.25372570753097534, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 8666 + }, + { + "epoch": 6.922523961661342, + "grad_norm": 0.13954615592956543, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 8667 + }, + { + "epoch": 6.92332268370607, + "grad_norm": 0.17769959568977356, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8668 + }, + { + "epoch": 6.924121405750799, + "grad_norm": 0.14327546954154968, + "learning_rate": 0.0005, + "loss": 1.0716, + "step": 8669 + }, + { + "epoch": 6.924920127795527, + "grad_norm": 0.07454083859920502, + "learning_rate": 0.0005, + "loss": 1.0668, + "step": 8670 + }, + { + "epoch": 6.925718849840256, + "grad_norm": 0.18561266362667084, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8671 + }, + { + "epoch": 6.926517571884984, + "grad_norm": 0.11927005648612976, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 8672 + }, + { + "epoch": 6.927316293929713, + "grad_norm": 0.06790865212678909, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 8673 + }, + { + "epoch": 6.928115015974441, + "grad_norm": 0.22627630829811096, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8674 + }, + { + "epoch": 6.928913738019169, + "grad_norm": 0.21341092884540558, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8675 + }, + { + "epoch": 6.9297124600638975, + "grad_norm": 0.19292457401752472, + "learning_rate": 0.0005, + "loss": 1.069, + "step": 8676 + }, + { + "epoch": 6.930511182108626, + "grad_norm": 0.15046356618404388, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8677 + }, + { + "epoch": 6.931309904153355, + "grad_norm": 0.13845203816890717, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8678 + }, + { + "epoch": 6.932108626198083, + "grad_norm": 0.18034739792346954, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8679 + }, + { + "epoch": 6.932907348242812, + "grad_norm": 0.3970269560813904, + "learning_rate": 0.0005, + "loss": 1.0675, + "step": 8680 + }, + { + "epoch": 6.93370607028754, + "grad_norm": 0.133075550198555, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8681 + }, + { + "epoch": 6.934504792332269, + "grad_norm": 0.13149690628051758, + "learning_rate": 0.0005, + "loss": 1.0679, + "step": 8682 + }, + { + "epoch": 6.935303514376997, + "grad_norm": 0.1332010179758072, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8683 + }, + { + "epoch": 6.936102236421725, + "grad_norm": 0.13125883042812347, + "learning_rate": 0.0005, + "loss": 1.0692, + "step": 8684 + }, + { + "epoch": 6.936900958466453, + "grad_norm": 0.5500382781028748, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 8685 + }, + { + "epoch": 6.937699680511182, + "grad_norm": 0.09766851365566254, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8686 + }, + { + "epoch": 6.93849840255591, + "grad_norm": 0.10732626169919968, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8687 + }, + { + "epoch": 6.939297124600639, + "grad_norm": 0.10059154033660889, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8688 + }, + { + "epoch": 6.9400958466453675, + "grad_norm": 0.09518695622682571, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8689 + }, + { + "epoch": 6.940894568690096, + "grad_norm": 0.1279720813035965, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8690 + }, + { + "epoch": 6.9416932907348246, + "grad_norm": 0.0997946485877037, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8691 + }, + { + "epoch": 6.942492012779553, + "grad_norm": 0.08584152907133102, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8692 + }, + { + "epoch": 6.943290734824281, + "grad_norm": 0.06987651437520981, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8693 + }, + { + "epoch": 6.944089456869009, + "grad_norm": 0.10446512699127197, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8694 + }, + { + "epoch": 6.944888178913738, + "grad_norm": 0.08535288274288177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8695 + }, + { + "epoch": 6.945686900958466, + "grad_norm": 0.15912187099456787, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8696 + }, + { + "epoch": 6.946485623003195, + "grad_norm": 0.20139484107494354, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8697 + }, + { + "epoch": 6.947284345047923, + "grad_norm": 0.10153409093618393, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8698 + }, + { + "epoch": 6.948083067092652, + "grad_norm": 0.04925902560353279, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8699 + }, + { + "epoch": 6.94888178913738, + "grad_norm": 0.13896742463111877, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8700 + }, + { + "epoch": 6.949680511182109, + "grad_norm": 0.07297761738300323, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8701 + }, + { + "epoch": 6.950479233226837, + "grad_norm": 0.09260845929384232, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8702 + }, + { + "epoch": 6.951277955271565, + "grad_norm": 0.11840535700321198, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8703 + }, + { + "epoch": 6.952076677316294, + "grad_norm": 0.17365501821041107, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 8704 + }, + { + "epoch": 6.952875399361022, + "grad_norm": 0.1369183212518692, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8705 + }, + { + "epoch": 6.953674121405751, + "grad_norm": 0.11277196556329727, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8706 + }, + { + "epoch": 6.954472843450479, + "grad_norm": 0.11032512784004211, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8707 + }, + { + "epoch": 6.955271565495208, + "grad_norm": 0.12437347322702408, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8708 + }, + { + "epoch": 6.956070287539936, + "grad_norm": 0.08772306144237518, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8709 + }, + { + "epoch": 6.956869009584665, + "grad_norm": 0.05245213583111763, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8710 + }, + { + "epoch": 6.957667731629393, + "grad_norm": 0.1591174304485321, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8711 + }, + { + "epoch": 6.958466453674122, + "grad_norm": 0.21121510863304138, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8712 + }, + { + "epoch": 6.9592651757188495, + "grad_norm": 0.11379709839820862, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8713 + }, + { + "epoch": 6.960063897763578, + "grad_norm": 0.10083793848752975, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 8714 + }, + { + "epoch": 6.960862619808307, + "grad_norm": 0.0790674164891243, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8715 + }, + { + "epoch": 6.961661341853035, + "grad_norm": 0.13917089998722076, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8716 + }, + { + "epoch": 6.962460063897764, + "grad_norm": 0.18794408440589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 8717 + }, + { + "epoch": 6.963258785942492, + "grad_norm": 0.10725098103284836, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 8718 + }, + { + "epoch": 6.964057507987221, + "grad_norm": 0.14577186107635498, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8719 + }, + { + "epoch": 6.964856230031949, + "grad_norm": 0.06711703538894653, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 8720 + }, + { + "epoch": 6.965654952076678, + "grad_norm": 0.20572635531425476, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8721 + }, + { + "epoch": 6.966453674121405, + "grad_norm": 0.13693936169147491, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8722 + }, + { + "epoch": 6.967252396166134, + "grad_norm": 0.05642275512218475, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8723 + }, + { + "epoch": 6.968051118210862, + "grad_norm": 0.09080768376588821, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8724 + }, + { + "epoch": 6.968849840255591, + "grad_norm": 0.05295126140117645, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8725 + }, + { + "epoch": 6.9696485623003195, + "grad_norm": 0.11833932250738144, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8726 + }, + { + "epoch": 6.970447284345048, + "grad_norm": 0.12110085785388947, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8727 + }, + { + "epoch": 6.9712460063897765, + "grad_norm": 0.10044527053833008, + "learning_rate": 0.0005, + "loss": 1.0473, + "step": 8728 + }, + { + "epoch": 6.972044728434505, + "grad_norm": 0.13638640940189362, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8729 + }, + { + "epoch": 6.972843450479234, + "grad_norm": 0.18118594586849213, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8730 + }, + { + "epoch": 6.973642172523961, + "grad_norm": 0.1394396871328354, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8731 + }, + { + "epoch": 6.97444089456869, + "grad_norm": 0.14276480674743652, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8732 + }, + { + "epoch": 6.975239616613418, + "grad_norm": 0.2213817834854126, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8733 + }, + { + "epoch": 6.976038338658147, + "grad_norm": 0.11497826874256134, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 8734 + }, + { + "epoch": 6.976837060702875, + "grad_norm": 0.11436138302087784, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8735 + }, + { + "epoch": 6.977635782747604, + "grad_norm": 0.08433762192726135, + "learning_rate": 0.0005, + "loss": 1.0634, + "step": 8736 + }, + { + "epoch": 6.978434504792332, + "grad_norm": 0.1584242880344391, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8737 + }, + { + "epoch": 6.979233226837061, + "grad_norm": 0.09111067652702332, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8738 + }, + { + "epoch": 6.9800319488817895, + "grad_norm": 0.09075064212083817, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8739 + }, + { + "epoch": 6.980830670926517, + "grad_norm": 0.08456333726644516, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8740 + }, + { + "epoch": 6.981629392971246, + "grad_norm": 0.08090690523386002, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8741 + }, + { + "epoch": 6.982428115015974, + "grad_norm": 0.42019179463386536, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 8742 + }, + { + "epoch": 6.983226837060703, + "grad_norm": 0.119536854326725, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8743 + }, + { + "epoch": 6.984025559105431, + "grad_norm": 0.08138761669397354, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 8744 + }, + { + "epoch": 6.98482428115016, + "grad_norm": 0.5337278246879578, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8745 + }, + { + "epoch": 6.985623003194888, + "grad_norm": 0.1773308366537094, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 8746 + }, + { + "epoch": 6.986421725239617, + "grad_norm": 0.10939478129148483, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8747 + }, + { + "epoch": 6.987220447284345, + "grad_norm": 0.18635793030261993, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8748 + }, + { + "epoch": 6.988019169329074, + "grad_norm": 0.11675454676151276, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8749 + }, + { + "epoch": 6.988817891373802, + "grad_norm": 0.11787068843841553, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8750 + }, + { + "epoch": 6.98961661341853, + "grad_norm": 0.2457057386636734, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8751 + }, + { + "epoch": 6.9904153354632586, + "grad_norm": 0.05914906784892082, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 8752 + }, + { + "epoch": 6.991214057507987, + "grad_norm": 0.1494094878435135, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 8753 + }, + { + "epoch": 6.992012779552716, + "grad_norm": 0.14485910534858704, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8754 + }, + { + "epoch": 6.992811501597444, + "grad_norm": 1.2348047494888306, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8755 + }, + { + "epoch": 6.993610223642173, + "grad_norm": 0.1546175330877304, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8756 + }, + { + "epoch": 6.994408945686901, + "grad_norm": 0.13474640250205994, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 8757 + }, + { + "epoch": 6.99520766773163, + "grad_norm": 0.5535407662391663, + "learning_rate": 0.0005, + "loss": 1.0483, + "step": 8758 + }, + { + "epoch": 6.996006389776358, + "grad_norm": 0.10516832023859024, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 8759 + }, + { + "epoch": 6.996805111821086, + "grad_norm": 0.07872752100229263, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8760 + }, + { + "epoch": 6.997603833865814, + "grad_norm": 0.08130715042352676, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 8761 + }, + { + "epoch": 6.998402555910543, + "grad_norm": 0.09496142715215683, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 8762 + }, + { + "epoch": 6.9992012779552715, + "grad_norm": 0.06645053625106812, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8763 + }, + { + "epoch": 7.0, + "grad_norm": 0.07332758605480194, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8764 + }, + { + "epoch": 7.0007987220447285, + "grad_norm": 0.09108536690473557, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8765 + }, + { + "epoch": 7.001597444089457, + "grad_norm": 0.13202883303165436, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8766 + }, + { + "epoch": 7.002396166134186, + "grad_norm": 0.09079252928495407, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 8767 + }, + { + "epoch": 7.003194888178914, + "grad_norm": 0.1004822626709938, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8768 + }, + { + "epoch": 7.003993610223642, + "grad_norm": 0.05096781253814697, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8769 + }, + { + "epoch": 7.00479233226837, + "grad_norm": 0.14213396608829498, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8770 + }, + { + "epoch": 7.005591054313099, + "grad_norm": 0.11614344269037247, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8771 + }, + { + "epoch": 7.006389776357827, + "grad_norm": 0.1144147664308548, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 8772 + }, + { + "epoch": 7.007188498402556, + "grad_norm": 0.1504330188035965, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8773 + }, + { + "epoch": 7.007987220447284, + "grad_norm": 0.10443079471588135, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8774 + }, + { + "epoch": 7.008785942492013, + "grad_norm": 0.166890949010849, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 8775 + }, + { + "epoch": 7.0095846645367414, + "grad_norm": 0.12496565282344818, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8776 + }, + { + "epoch": 7.01038338658147, + "grad_norm": 0.12851381301879883, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8777 + }, + { + "epoch": 7.0111821086261985, + "grad_norm": 0.20198717713356018, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 8778 + }, + { + "epoch": 7.011980830670926, + "grad_norm": 0.10324864089488983, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8779 + }, + { + "epoch": 7.012779552715655, + "grad_norm": 0.12864094972610474, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8780 + }, + { + "epoch": 7.013578274760383, + "grad_norm": 0.11301549524068832, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8781 + }, + { + "epoch": 7.014376996805112, + "grad_norm": 0.13162367045879364, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8782 + }, + { + "epoch": 7.01517571884984, + "grad_norm": 0.1574760377407074, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 8783 + }, + { + "epoch": 7.015974440894569, + "grad_norm": 0.07471634447574615, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8784 + }, + { + "epoch": 7.016773162939297, + "grad_norm": 0.09653516113758087, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8785 + }, + { + "epoch": 7.017571884984026, + "grad_norm": 0.13719993829727173, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8786 + }, + { + "epoch": 7.018370607028754, + "grad_norm": 0.10545443743467331, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8787 + }, + { + "epoch": 7.019169329073482, + "grad_norm": 0.1147511675953865, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 8788 + }, + { + "epoch": 7.0199680511182105, + "grad_norm": 0.14005234837532043, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8789 + }, + { + "epoch": 7.020766773162939, + "grad_norm": 0.36956554651260376, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8790 + }, + { + "epoch": 7.021565495207668, + "grad_norm": 0.1384177953004837, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 8791 + }, + { + "epoch": 7.022364217252396, + "grad_norm": 0.062106356024742126, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 8792 + }, + { + "epoch": 7.023162939297125, + "grad_norm": 0.14074385166168213, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8793 + }, + { + "epoch": 7.023961661341853, + "grad_norm": 0.18152809143066406, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8794 + }, + { + "epoch": 7.024760383386582, + "grad_norm": 0.11607832461595535, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8795 + }, + { + "epoch": 7.02555910543131, + "grad_norm": 0.06603241711854935, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 8796 + }, + { + "epoch": 7.026357827476039, + "grad_norm": 0.08846289664506912, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8797 + }, + { + "epoch": 7.027156549520766, + "grad_norm": 0.09882134944200516, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8798 + }, + { + "epoch": 7.027955271565495, + "grad_norm": 0.11535032093524933, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8799 + }, + { + "epoch": 7.0287539936102235, + "grad_norm": 0.10153281688690186, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8800 + }, + { + "epoch": 7.029552715654952, + "grad_norm": 0.11195418983697891, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8801 + }, + { + "epoch": 7.0303514376996805, + "grad_norm": 0.5721603035926819, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 8802 + }, + { + "epoch": 7.031150159744409, + "grad_norm": 0.18006286025047302, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8803 + }, + { + "epoch": 7.031948881789138, + "grad_norm": 0.16561086475849152, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8804 + }, + { + "epoch": 7.032747603833866, + "grad_norm": 0.11010444164276123, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8805 + }, + { + "epoch": 7.033546325878595, + "grad_norm": 0.17741475999355316, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 8806 + }, + { + "epoch": 7.034345047923322, + "grad_norm": 0.09941161423921585, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 8807 + }, + { + "epoch": 7.035143769968051, + "grad_norm": 0.20474617183208466, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8808 + }, + { + "epoch": 7.035942492012779, + "grad_norm": 0.07972154021263123, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8809 + }, + { + "epoch": 7.036741214057508, + "grad_norm": 0.17856109142303467, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 8810 + }, + { + "epoch": 7.037539936102236, + "grad_norm": 0.1276514083147049, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 8811 + }, + { + "epoch": 7.038338658146965, + "grad_norm": 0.08009849488735199, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 8812 + }, + { + "epoch": 7.039137380191693, + "grad_norm": 0.09832913428544998, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 8813 + }, + { + "epoch": 7.039936102236422, + "grad_norm": 0.06454402953386307, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8814 + }, + { + "epoch": 7.0407348242811505, + "grad_norm": 0.20843401551246643, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8815 + }, + { + "epoch": 7.041533546325879, + "grad_norm": 0.14909301698207855, + "learning_rate": 0.0005, + "loss": 1.051, + "step": 8816 + }, + { + "epoch": 7.042332268370607, + "grad_norm": 0.08815812319517136, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8817 + }, + { + "epoch": 7.043130990415335, + "grad_norm": 0.18957766890525818, + "learning_rate": 0.0005, + "loss": 1.0474, + "step": 8818 + }, + { + "epoch": 7.043929712460064, + "grad_norm": 0.33018213510513306, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8819 + }, + { + "epoch": 7.044728434504792, + "grad_norm": 0.11069374531507492, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 8820 + }, + { + "epoch": 7.045527156549521, + "grad_norm": 0.3001084625720978, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8821 + }, + { + "epoch": 7.046325878594249, + "grad_norm": 0.0704922303557396, + "learning_rate": 0.0005, + "loss": 1.0436, + "step": 8822 + }, + { + "epoch": 7.047124600638978, + "grad_norm": 0.08537211269140244, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8823 + }, + { + "epoch": 7.047923322683706, + "grad_norm": 0.08765899389982224, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 8824 + }, + { + "epoch": 7.048722044728435, + "grad_norm": 0.14218255877494812, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8825 + }, + { + "epoch": 7.0495207667731625, + "grad_norm": 0.08026671409606934, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8826 + }, + { + "epoch": 7.050319488817891, + "grad_norm": 0.07170549035072327, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8827 + }, + { + "epoch": 7.05111821086262, + "grad_norm": 1.2578401565551758, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8828 + }, + { + "epoch": 7.051916932907348, + "grad_norm": 0.20149891078472137, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 8829 + }, + { + "epoch": 7.052715654952077, + "grad_norm": 0.18734677135944366, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8830 + }, + { + "epoch": 7.053514376996805, + "grad_norm": 0.08732877671718597, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8831 + }, + { + "epoch": 7.054313099041534, + "grad_norm": 0.1895754486322403, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 8832 + }, + { + "epoch": 7.055111821086262, + "grad_norm": 0.06839644908905029, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8833 + }, + { + "epoch": 7.055910543130991, + "grad_norm": 4.666222095489502, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 8834 + }, + { + "epoch": 7.056709265175719, + "grad_norm": 0.2801821231842041, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8835 + }, + { + "epoch": 7.057507987220447, + "grad_norm": 0.3428499102592468, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8836 + }, + { + "epoch": 7.0583067092651754, + "grad_norm": 0.16896478831768036, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8837 + }, + { + "epoch": 7.059105431309904, + "grad_norm": 1.21062171459198, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 8838 + }, + { + "epoch": 7.0599041533546325, + "grad_norm": 0.20507270097732544, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8839 + }, + { + "epoch": 7.060702875399361, + "grad_norm": 0.34736308455467224, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8840 + }, + { + "epoch": 7.06150159744409, + "grad_norm": 0.13628798723220825, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8841 + }, + { + "epoch": 7.062300319488818, + "grad_norm": 0.3212411403656006, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8842 + }, + { + "epoch": 7.063099041533547, + "grad_norm": 0.23049144446849823, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 8843 + }, + { + "epoch": 7.063897763578275, + "grad_norm": 0.2785285413265228, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8844 + }, + { + "epoch": 7.064696485623003, + "grad_norm": 0.32158368825912476, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8845 + }, + { + "epoch": 7.065495207667731, + "grad_norm": 0.40443500876426697, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8846 + }, + { + "epoch": 7.06629392971246, + "grad_norm": 0.20072752237319946, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8847 + }, + { + "epoch": 7.067092651757188, + "grad_norm": 0.38166266679763794, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8848 + }, + { + "epoch": 7.067891373801917, + "grad_norm": 0.2771472930908203, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 8849 + }, + { + "epoch": 7.068690095846645, + "grad_norm": 0.10485964268445969, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8850 + }, + { + "epoch": 7.069488817891374, + "grad_norm": 0.17424215376377106, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8851 + }, + { + "epoch": 7.0702875399361025, + "grad_norm": 0.0972314327955246, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8852 + }, + { + "epoch": 7.071086261980831, + "grad_norm": 0.18021832406520844, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 8853 + }, + { + "epoch": 7.0718849840255595, + "grad_norm": 0.08820143342018127, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8854 + }, + { + "epoch": 7.072683706070287, + "grad_norm": 0.1785898506641388, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8855 + }, + { + "epoch": 7.073482428115016, + "grad_norm": 0.3515510559082031, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 8856 + }, + { + "epoch": 7.074281150159744, + "grad_norm": 0.1787438541650772, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8857 + }, + { + "epoch": 7.075079872204473, + "grad_norm": 0.16761353611946106, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8858 + }, + { + "epoch": 7.075878594249201, + "grad_norm": 0.5075165629386902, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8859 + }, + { + "epoch": 7.07667731629393, + "grad_norm": 0.13462364673614502, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8860 + }, + { + "epoch": 7.077476038338658, + "grad_norm": 0.20478707551956177, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8861 + }, + { + "epoch": 7.078274760383387, + "grad_norm": 0.14689947664737701, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 8862 + }, + { + "epoch": 7.079073482428115, + "grad_norm": 0.36265847086906433, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 8863 + }, + { + "epoch": 7.079872204472843, + "grad_norm": 0.18443043529987335, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8864 + }, + { + "epoch": 7.080670926517572, + "grad_norm": 0.04789111018180847, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8865 + }, + { + "epoch": 7.0814696485623, + "grad_norm": 0.18024222552776337, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8866 + }, + { + "epoch": 7.082268370607029, + "grad_norm": 0.08901690691709518, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 8867 + }, + { + "epoch": 7.083067092651757, + "grad_norm": 0.20689153671264648, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8868 + }, + { + "epoch": 7.083865814696486, + "grad_norm": 0.15572768449783325, + "learning_rate": 0.0005, + "loss": 1.0426, + "step": 8869 + }, + { + "epoch": 7.084664536741214, + "grad_norm": 0.2915050685405731, + "learning_rate": 0.0005, + "loss": 1.0493, + "step": 8870 + }, + { + "epoch": 7.085463258785943, + "grad_norm": 0.12404290586709976, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8871 + }, + { + "epoch": 7.086261980830671, + "grad_norm": 0.19628335535526276, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8872 + }, + { + "epoch": 7.0870607028754, + "grad_norm": 0.6693617105484009, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8873 + }, + { + "epoch": 7.087859424920127, + "grad_norm": 0.21526481211185455, + "learning_rate": 0.0005, + "loss": 1.0618, + "step": 8874 + }, + { + "epoch": 7.088658146964856, + "grad_norm": 0.2779954969882965, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8875 + }, + { + "epoch": 7.0894568690095845, + "grad_norm": 0.14111320674419403, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 8876 + }, + { + "epoch": 7.090255591054313, + "grad_norm": 0.26465079188346863, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 8877 + }, + { + "epoch": 7.0910543130990416, + "grad_norm": 0.12354349344968796, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 8878 + }, + { + "epoch": 7.09185303514377, + "grad_norm": 0.18360896408557892, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 8879 + }, + { + "epoch": 7.092651757188499, + "grad_norm": 0.26844218373298645, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8880 + }, + { + "epoch": 7.093450479233227, + "grad_norm": 0.34032055735588074, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8881 + }, + { + "epoch": 7.094249201277956, + "grad_norm": 0.2372630089521408, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 8882 + }, + { + "epoch": 7.095047923322683, + "grad_norm": 0.4134571850299835, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 8883 + }, + { + "epoch": 7.095846645367412, + "grad_norm": 0.21220949292182922, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 8884 + }, + { + "epoch": 7.09664536741214, + "grad_norm": 0.20073527097702026, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 8885 + }, + { + "epoch": 7.097444089456869, + "grad_norm": 0.1583309918642044, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 8886 + }, + { + "epoch": 7.098242811501597, + "grad_norm": 0.4032151401042938, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 8887 + }, + { + "epoch": 7.099041533546326, + "grad_norm": 0.09527560323476791, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 8888 + }, + { + "epoch": 7.0998402555910545, + "grad_norm": 0.2630043625831604, + "learning_rate": 0.0005, + "loss": 1.0574, + "step": 8889 + }, + { + "epoch": 7.100638977635783, + "grad_norm": 0.06699138134717941, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8890 + }, + { + "epoch": 7.1014376996805115, + "grad_norm": 0.34307003021240234, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8891 + }, + { + "epoch": 7.102236421725239, + "grad_norm": 0.24538451433181763, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8892 + }, + { + "epoch": 7.103035143769968, + "grad_norm": 0.2794513702392578, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8893 + }, + { + "epoch": 7.103833865814696, + "grad_norm": 0.20586012303829193, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 8894 + }, + { + "epoch": 7.104632587859425, + "grad_norm": 0.22349807620048523, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 8895 + }, + { + "epoch": 7.105431309904153, + "grad_norm": 0.31171584129333496, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 8896 + }, + { + "epoch": 7.106230031948882, + "grad_norm": 0.07461030781269073, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 8897 + }, + { + "epoch": 7.10702875399361, + "grad_norm": 0.24280597269535065, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 8898 + }, + { + "epoch": 7.107827476038339, + "grad_norm": 0.13005708158016205, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8899 + }, + { + "epoch": 7.108626198083067, + "grad_norm": 0.24730080366134644, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8900 + }, + { + "epoch": 7.109424920127796, + "grad_norm": 1.287341833114624, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 8901 + }, + { + "epoch": 7.110223642172524, + "grad_norm": 0.15945735573768616, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 8902 + }, + { + "epoch": 7.111022364217252, + "grad_norm": 0.09943541884422302, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 8903 + }, + { + "epoch": 7.111821086261981, + "grad_norm": 0.12183468043804169, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8904 + }, + { + "epoch": 7.112619808306709, + "grad_norm": 0.11859191954135895, + "learning_rate": 0.0005, + "loss": 1.054, + "step": 8905 + }, + { + "epoch": 7.113418530351438, + "grad_norm": 0.27701425552368164, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8906 + }, + { + "epoch": 7.114217252396166, + "grad_norm": 0.14724725484848022, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 8907 + }, + { + "epoch": 7.115015974440895, + "grad_norm": 0.1342400461435318, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 8908 + }, + { + "epoch": 7.115814696485623, + "grad_norm": 0.15474970638751984, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8909 + }, + { + "epoch": 7.116613418530352, + "grad_norm": 0.1276721954345703, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 8910 + }, + { + "epoch": 7.11741214057508, + "grad_norm": 0.14511124789714813, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8911 + }, + { + "epoch": 7.118210862619808, + "grad_norm": 0.10112027823925018, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 8912 + }, + { + "epoch": 7.1190095846645365, + "grad_norm": 0.17296795547008514, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8913 + }, + { + "epoch": 7.119808306709265, + "grad_norm": 0.09542828798294067, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8914 + }, + { + "epoch": 7.1206070287539935, + "grad_norm": 0.17453183233737946, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 8915 + }, + { + "epoch": 7.121405750798722, + "grad_norm": 0.13417603075504303, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 8916 + }, + { + "epoch": 7.122204472843451, + "grad_norm": 0.26239508390426636, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8917 + }, + { + "epoch": 7.123003194888179, + "grad_norm": 0.13963834941387177, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8918 + }, + { + "epoch": 7.123801916932908, + "grad_norm": 0.18642054498195648, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8919 + }, + { + "epoch": 7.124600638977636, + "grad_norm": 0.17754590511322021, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8920 + }, + { + "epoch": 7.125399361022364, + "grad_norm": 0.1010628268122673, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 8921 + }, + { + "epoch": 7.126198083067092, + "grad_norm": 0.1621905416250229, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 8922 + }, + { + "epoch": 7.126996805111821, + "grad_norm": 0.3069966733455658, + "learning_rate": 0.0005, + "loss": 1.0614, + "step": 8923 + }, + { + "epoch": 7.127795527156549, + "grad_norm": 0.2312333881855011, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 8924 + }, + { + "epoch": 7.128594249201278, + "grad_norm": 0.20297785103321075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 8925 + }, + { + "epoch": 7.1293929712460065, + "grad_norm": 0.18856601417064667, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8926 + }, + { + "epoch": 7.130191693290735, + "grad_norm": 0.19353985786437988, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 8927 + }, + { + "epoch": 7.1309904153354635, + "grad_norm": 0.08276687562465668, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 8928 + }, + { + "epoch": 7.131789137380192, + "grad_norm": 0.31372779607772827, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 8929 + }, + { + "epoch": 7.13258785942492, + "grad_norm": 0.10208959877490997, + "learning_rate": 0.0005, + "loss": 1.0499, + "step": 8930 + }, + { + "epoch": 7.133386581469648, + "grad_norm": 0.1636659801006317, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 8931 + }, + { + "epoch": 7.134185303514377, + "grad_norm": 0.14321425557136536, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8932 + }, + { + "epoch": 7.134984025559105, + "grad_norm": 0.08438511192798615, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 8933 + }, + { + "epoch": 7.135782747603834, + "grad_norm": 0.17451012134552002, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8934 + }, + { + "epoch": 7.136581469648562, + "grad_norm": 0.06913795322179794, + "learning_rate": 0.0005, + "loss": 1.0496, + "step": 8935 + }, + { + "epoch": 7.137380191693291, + "grad_norm": 0.14176666736602783, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 8936 + }, + { + "epoch": 7.138178913738019, + "grad_norm": 0.15005643665790558, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 8937 + }, + { + "epoch": 7.138977635782748, + "grad_norm": 0.08884457498788834, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 8938 + }, + { + "epoch": 7.139776357827476, + "grad_norm": 0.19651612639427185, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 8939 + }, + { + "epoch": 7.140575079872204, + "grad_norm": 0.12419132143259048, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8940 + }, + { + "epoch": 7.141373801916933, + "grad_norm": 0.08800125867128372, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 8941 + }, + { + "epoch": 7.142172523961661, + "grad_norm": 0.12308578193187714, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8942 + }, + { + "epoch": 7.14297124600639, + "grad_norm": 0.06376682221889496, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 8943 + }, + { + "epoch": 7.143769968051118, + "grad_norm": 0.08467467129230499, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8944 + }, + { + "epoch": 7.144568690095847, + "grad_norm": 0.05492696538567543, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 8945 + }, + { + "epoch": 7.145367412140575, + "grad_norm": 0.12659363448619843, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8946 + }, + { + "epoch": 7.146166134185304, + "grad_norm": 0.11025204509496689, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 8947 + }, + { + "epoch": 7.146964856230032, + "grad_norm": 0.03672007843852043, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 8948 + }, + { + "epoch": 7.147763578274761, + "grad_norm": 0.06386546790599823, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 8949 + }, + { + "epoch": 7.1485623003194885, + "grad_norm": 0.05484751984477043, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 8950 + }, + { + "epoch": 7.149361022364217, + "grad_norm": 0.08663280308246613, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 8951 + }, + { + "epoch": 7.1501597444089455, + "grad_norm": 0.10515031963586807, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 8952 + }, + { + "epoch": 7.150958466453674, + "grad_norm": 0.05844622105360031, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 8953 + }, + { + "epoch": 7.151757188498403, + "grad_norm": 0.061575960367918015, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 8954 + }, + { + "epoch": 7.152555910543131, + "grad_norm": 0.30169913172721863, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 8955 + }, + { + "epoch": 7.15335463258786, + "grad_norm": 0.15433792769908905, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 8956 + }, + { + "epoch": 7.154153354632588, + "grad_norm": 0.11872339993715286, + "learning_rate": 0.0005, + "loss": 1.0494, + "step": 8957 + }, + { + "epoch": 7.154952076677317, + "grad_norm": 0.4086587131023407, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 8958 + }, + { + "epoch": 7.155750798722044, + "grad_norm": 0.0976172536611557, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 8959 + }, + { + "epoch": 7.156549520766773, + "grad_norm": 0.11132699996232986, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8960 + }, + { + "epoch": 7.157348242811501, + "grad_norm": 0.11129645258188248, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 8961 + }, + { + "epoch": 7.15814696485623, + "grad_norm": 0.09004200249910355, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 8962 + }, + { + "epoch": 7.1589456869009584, + "grad_norm": 0.1225908026099205, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 8963 + }, + { + "epoch": 7.159744408945687, + "grad_norm": 0.10531286895275116, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 8964 + }, + { + "epoch": 7.1605431309904155, + "grad_norm": 0.1054515391588211, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 8965 + }, + { + "epoch": 7.161341853035144, + "grad_norm": 0.11718834936618805, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8966 + }, + { + "epoch": 7.162140575079873, + "grad_norm": 0.11314168572425842, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 8967 + }, + { + "epoch": 7.1629392971246, + "grad_norm": 0.1017487570643425, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 8968 + }, + { + "epoch": 7.163738019169329, + "grad_norm": 0.05381032079458237, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 8969 + }, + { + "epoch": 7.164536741214057, + "grad_norm": 0.1527879238128662, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 8970 + }, + { + "epoch": 7.165335463258786, + "grad_norm": 0.05352415144443512, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 8971 + }, + { + "epoch": 7.166134185303514, + "grad_norm": 0.17179784178733826, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 8972 + }, + { + "epoch": 7.166932907348243, + "grad_norm": 0.24629469215869904, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 8973 + }, + { + "epoch": 7.167731629392971, + "grad_norm": 0.11276146024465561, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 8974 + }, + { + "epoch": 7.1685303514377, + "grad_norm": 0.0927032083272934, + "learning_rate": 0.0005, + "loss": 1.0468, + "step": 8975 + }, + { + "epoch": 7.169329073482428, + "grad_norm": 0.0978626236319542, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 8976 + }, + { + "epoch": 7.170127795527157, + "grad_norm": 0.12577946484088898, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 8977 + }, + { + "epoch": 7.170926517571885, + "grad_norm": 0.1014678105711937, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 8978 + }, + { + "epoch": 7.171725239616613, + "grad_norm": 0.08706190437078476, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 8979 + }, + { + "epoch": 7.172523961661342, + "grad_norm": 0.06214338168501854, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 8980 + }, + { + "epoch": 7.17332268370607, + "grad_norm": 0.08223161101341248, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 8981 + }, + { + "epoch": 7.174121405750799, + "grad_norm": 0.3143157362937927, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 8982 + }, + { + "epoch": 7.174920127795527, + "grad_norm": 0.16466212272644043, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 8983 + }, + { + "epoch": 7.175718849840256, + "grad_norm": 0.13650043308734894, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 8984 + }, + { + "epoch": 7.176517571884984, + "grad_norm": 0.05605694651603699, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 8985 + }, + { + "epoch": 7.177316293929713, + "grad_norm": 0.12153269350528717, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 8986 + }, + { + "epoch": 7.178115015974441, + "grad_norm": 0.07390844076871872, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8987 + }, + { + "epoch": 7.178913738019169, + "grad_norm": 0.05618416517972946, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 8988 + }, + { + "epoch": 7.1797124600638975, + "grad_norm": 0.24178527295589447, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 8989 + }, + { + "epoch": 7.180511182108626, + "grad_norm": 0.06414328515529633, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 8990 + }, + { + "epoch": 7.181309904153355, + "grad_norm": 0.05483662337064743, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 8991 + }, + { + "epoch": 7.182108626198083, + "grad_norm": 0.05821032077074051, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 8992 + }, + { + "epoch": 7.182907348242812, + "grad_norm": 0.04972073435783386, + "learning_rate": 0.0005, + "loss": 1.0492, + "step": 8993 + }, + { + "epoch": 7.18370607028754, + "grad_norm": 0.13323748111724854, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 8994 + }, + { + "epoch": 7.184504792332269, + "grad_norm": 0.1341763287782669, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 8995 + }, + { + "epoch": 7.185303514376997, + "grad_norm": 0.1092606782913208, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 8996 + }, + { + "epoch": 7.186102236421725, + "grad_norm": 0.10611139982938766, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 8997 + }, + { + "epoch": 7.186900958466453, + "grad_norm": 0.0810476616024971, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 8998 + }, + { + "epoch": 7.187699680511182, + "grad_norm": 0.053938958793878555, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 8999 + }, + { + "epoch": 7.18849840255591, + "grad_norm": 0.08355431258678436, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9000 + }, + { + "epoch": 7.189297124600639, + "grad_norm": 0.0719372034072876, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9001 + }, + { + "epoch": 7.1900958466453675, + "grad_norm": 0.0541183203458786, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9002 + }, + { + "epoch": 7.190894568690096, + "grad_norm": 0.08637872338294983, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9003 + }, + { + "epoch": 7.1916932907348246, + "grad_norm": 0.0900801345705986, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9004 + }, + { + "epoch": 7.192492012779553, + "grad_norm": 0.08778835088014603, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9005 + }, + { + "epoch": 7.193290734824281, + "grad_norm": 0.13946911692619324, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9006 + }, + { + "epoch": 7.194089456869009, + "grad_norm": 0.20089952647686005, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9007 + }, + { + "epoch": 7.194888178913738, + "grad_norm": 0.20472672581672668, + "learning_rate": 0.0005, + "loss": 1.0467, + "step": 9008 + }, + { + "epoch": 7.195686900958466, + "grad_norm": 0.09503829479217529, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9009 + }, + { + "epoch": 7.196485623003195, + "grad_norm": 0.057289477437734604, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9010 + }, + { + "epoch": 7.197284345047923, + "grad_norm": 0.18998531997203827, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9011 + }, + { + "epoch": 7.198083067092652, + "grad_norm": 0.12228010594844818, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9012 + }, + { + "epoch": 7.19888178913738, + "grad_norm": 0.0855637639760971, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9013 + }, + { + "epoch": 7.199680511182109, + "grad_norm": 0.08341407775878906, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9014 + }, + { + "epoch": 7.2004792332268375, + "grad_norm": 0.06806697696447372, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9015 + }, + { + "epoch": 7.201277955271565, + "grad_norm": 0.06730692833662033, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9016 + }, + { + "epoch": 7.202076677316294, + "grad_norm": 0.04983438923954964, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9017 + }, + { + "epoch": 7.202875399361022, + "grad_norm": 0.09153386205434799, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9018 + }, + { + "epoch": 7.203674121405751, + "grad_norm": 0.06117153540253639, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9019 + }, + { + "epoch": 7.204472843450479, + "grad_norm": 0.056790344417095184, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9020 + }, + { + "epoch": 7.205271565495208, + "grad_norm": 0.8241305351257324, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9021 + }, + { + "epoch": 7.206070287539936, + "grad_norm": 0.21823863685131073, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9022 + }, + { + "epoch": 7.206869009584665, + "grad_norm": 0.14799124002456665, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9023 + }, + { + "epoch": 7.207667731629393, + "grad_norm": 0.09815513342618942, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9024 + }, + { + "epoch": 7.208466453674121, + "grad_norm": 0.2076011300086975, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9025 + }, + { + "epoch": 7.2092651757188495, + "grad_norm": 0.13652865588665009, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9026 + }, + { + "epoch": 7.210063897763578, + "grad_norm": 0.15180739760398865, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9027 + }, + { + "epoch": 7.210862619808307, + "grad_norm": 0.11385779827833176, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9028 + }, + { + "epoch": 7.211661341853035, + "grad_norm": 0.05047432705760002, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9029 + }, + { + "epoch": 7.212460063897764, + "grad_norm": 0.13789398968219757, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9030 + }, + { + "epoch": 7.213258785942492, + "grad_norm": 0.10509981215000153, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9031 + }, + { + "epoch": 7.214057507987221, + "grad_norm": 0.19650724530220032, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9032 + }, + { + "epoch": 7.214856230031949, + "grad_norm": 0.11788946390151978, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9033 + }, + { + "epoch": 7.215654952076678, + "grad_norm": 0.11023712903261185, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9034 + }, + { + "epoch": 7.216453674121405, + "grad_norm": 0.3382134735584259, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9035 + }, + { + "epoch": 7.217252396166134, + "grad_norm": 0.20465348660945892, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9036 + }, + { + "epoch": 7.218051118210862, + "grad_norm": 0.17456264793872833, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9037 + }, + { + "epoch": 7.218849840255591, + "grad_norm": 0.09034306555986404, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9038 + }, + { + "epoch": 7.2196485623003195, + "grad_norm": 0.15296493470668793, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9039 + }, + { + "epoch": 7.220447284345048, + "grad_norm": 0.1379650980234146, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9040 + }, + { + "epoch": 7.2212460063897765, + "grad_norm": 0.20932430028915405, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9041 + }, + { + "epoch": 7.222044728434505, + "grad_norm": 0.09309016168117523, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9042 + }, + { + "epoch": 7.222843450479234, + "grad_norm": 0.13084891438484192, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9043 + }, + { + "epoch": 7.223642172523961, + "grad_norm": 0.1435803472995758, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9044 + }, + { + "epoch": 7.22444089456869, + "grad_norm": 0.05868425592780113, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9045 + }, + { + "epoch": 7.225239616613418, + "grad_norm": 0.09483210742473602, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9046 + }, + { + "epoch": 7.226038338658147, + "grad_norm": 0.20051591098308563, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9047 + }, + { + "epoch": 7.226837060702875, + "grad_norm": 0.09253975749015808, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9048 + }, + { + "epoch": 7.227635782747604, + "grad_norm": 0.15865609049797058, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9049 + }, + { + "epoch": 7.228434504792332, + "grad_norm": 0.14421933889389038, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9050 + }, + { + "epoch": 7.229233226837061, + "grad_norm": 0.13492006063461304, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9051 + }, + { + "epoch": 7.2300319488817895, + "grad_norm": 0.06581155210733414, + "learning_rate": 0.0005, + "loss": 1.0487, + "step": 9052 + }, + { + "epoch": 7.230830670926518, + "grad_norm": 0.12610170245170593, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9053 + }, + { + "epoch": 7.231629392971246, + "grad_norm": 0.12813681364059448, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9054 + }, + { + "epoch": 7.232428115015974, + "grad_norm": 0.07228157669305801, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9055 + }, + { + "epoch": 7.233226837060703, + "grad_norm": 0.13456740975379944, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9056 + }, + { + "epoch": 7.234025559105431, + "grad_norm": 0.10491029918193817, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9057 + }, + { + "epoch": 7.23482428115016, + "grad_norm": 0.14090387523174286, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9058 + }, + { + "epoch": 7.235623003194888, + "grad_norm": 0.10722684115171432, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9059 + }, + { + "epoch": 7.236421725239617, + "grad_norm": 0.05123287811875343, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9060 + }, + { + "epoch": 7.237220447284345, + "grad_norm": 0.1203593909740448, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9061 + }, + { + "epoch": 7.238019169329074, + "grad_norm": 0.07847320288419724, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9062 + }, + { + "epoch": 7.2388178913738015, + "grad_norm": 0.09621457010507584, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9063 + }, + { + "epoch": 7.23961661341853, + "grad_norm": 0.11915068328380585, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9064 + }, + { + "epoch": 7.2404153354632586, + "grad_norm": 0.18357326090335846, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9065 + }, + { + "epoch": 7.241214057507987, + "grad_norm": 0.06862817704677582, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9066 + }, + { + "epoch": 7.242012779552716, + "grad_norm": 0.05091634392738342, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9067 + }, + { + "epoch": 7.242811501597444, + "grad_norm": 0.09132825583219528, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9068 + }, + { + "epoch": 7.243610223642173, + "grad_norm": 0.11998780816793442, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9069 + }, + { + "epoch": 7.244408945686901, + "grad_norm": 0.0678768903017044, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9070 + }, + { + "epoch": 7.24520766773163, + "grad_norm": 0.19880260527133942, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9071 + }, + { + "epoch": 7.246006389776358, + "grad_norm": 0.06379543989896774, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9072 + }, + { + "epoch": 7.246805111821086, + "grad_norm": 0.06652764976024628, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9073 + }, + { + "epoch": 7.247603833865814, + "grad_norm": 0.10495885461568832, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9074 + }, + { + "epoch": 7.248402555910543, + "grad_norm": 0.14753985404968262, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9075 + }, + { + "epoch": 7.2492012779552715, + "grad_norm": 0.08283182233572006, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9076 + }, + { + "epoch": 7.25, + "grad_norm": 0.1378672569990158, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9077 + }, + { + "epoch": 7.2507987220447285, + "grad_norm": 0.10274125635623932, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9078 + }, + { + "epoch": 7.251597444089457, + "grad_norm": 0.09236814826726913, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9079 + }, + { + "epoch": 7.252396166134186, + "grad_norm": 0.07923156023025513, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9080 + }, + { + "epoch": 7.253194888178914, + "grad_norm": 0.2953792214393616, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9081 + }, + { + "epoch": 7.253993610223642, + "grad_norm": 9.043856620788574, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9082 + }, + { + "epoch": 7.25479233226837, + "grad_norm": 60.094329833984375, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9083 + }, + { + "epoch": 7.255591054313099, + "grad_norm": 48.363075256347656, + "learning_rate": 0.0005, + "loss": 1.0946, + "step": 9084 + }, + { + "epoch": 7.256389776357827, + "grad_norm": 92.13807678222656, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9085 + }, + { + "epoch": 7.257188498402556, + "grad_norm": 71.66429138183594, + "learning_rate": 0.0005, + "loss": 1.1524, + "step": 9086 + }, + { + "epoch": 7.257987220447284, + "grad_norm": 29.742534637451172, + "learning_rate": 0.0005, + "loss": 1.2362, + "step": 9087 + }, + { + "epoch": 7.258785942492013, + "grad_norm": 1.1841496229171753, + "learning_rate": 0.0005, + "loss": 1.4452, + "step": 9088 + }, + { + "epoch": 7.2595846645367414, + "grad_norm": 0.7909824252128601, + "learning_rate": 0.0005, + "loss": 1.3049, + "step": 9089 + }, + { + "epoch": 7.26038338658147, + "grad_norm": 0.796114444732666, + "learning_rate": 0.0005, + "loss": 1.2852, + "step": 9090 + }, + { + "epoch": 7.261182108626198, + "grad_norm": 0.9014440178871155, + "learning_rate": 0.0005, + "loss": 1.2243, + "step": 9091 + }, + { + "epoch": 7.261980830670926, + "grad_norm": 0.5654944777488708, + "learning_rate": 0.0005, + "loss": 1.1462, + "step": 9092 + }, + { + "epoch": 7.262779552715655, + "grad_norm": 1.0784763097763062, + "learning_rate": 0.0005, + "loss": 1.1468, + "step": 9093 + }, + { + "epoch": 7.263578274760383, + "grad_norm": 0.9014595150947571, + "learning_rate": 0.0005, + "loss": 1.1629, + "step": 9094 + }, + { + "epoch": 7.264376996805112, + "grad_norm": 0.4847378730773926, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9095 + }, + { + "epoch": 7.26517571884984, + "grad_norm": 0.5493710041046143, + "learning_rate": 0.0005, + "loss": 1.1205, + "step": 9096 + }, + { + "epoch": 7.265974440894569, + "grad_norm": 1.0691193342208862, + "learning_rate": 0.0005, + "loss": 1.1194, + "step": 9097 + }, + { + "epoch": 7.266773162939297, + "grad_norm": 2.062331199645996, + "learning_rate": 0.0005, + "loss": 1.1095, + "step": 9098 + }, + { + "epoch": 7.267571884984026, + "grad_norm": 2.778977632522583, + "learning_rate": 0.0005, + "loss": 1.2775, + "step": 9099 + }, + { + "epoch": 7.268370607028754, + "grad_norm": 0.8807574510574341, + "learning_rate": 0.0005, + "loss": 1.2851, + "step": 9100 + }, + { + "epoch": 7.269169329073483, + "grad_norm": 1.0370792150497437, + "learning_rate": 0.0005, + "loss": 1.1677, + "step": 9101 + }, + { + "epoch": 7.2699680511182105, + "grad_norm": 0.5272591710090637, + "learning_rate": 0.0005, + "loss": 1.1754, + "step": 9102 + }, + { + "epoch": 7.270766773162939, + "grad_norm": 0.5510113835334778, + "learning_rate": 0.0005, + "loss": 1.1686, + "step": 9103 + }, + { + "epoch": 7.271565495207668, + "grad_norm": 0.4650730490684509, + "learning_rate": 0.0005, + "loss": 1.1741, + "step": 9104 + }, + { + "epoch": 7.272364217252396, + "grad_norm": 1.071080207824707, + "learning_rate": 0.0005, + "loss": 1.1418, + "step": 9105 + }, + { + "epoch": 7.273162939297125, + "grad_norm": 0.32088524103164673, + "learning_rate": 0.0005, + "loss": 1.1304, + "step": 9106 + }, + { + "epoch": 7.273961661341853, + "grad_norm": 1.2110369205474854, + "learning_rate": 0.0005, + "loss": 1.1168, + "step": 9107 + }, + { + "epoch": 7.274760383386582, + "grad_norm": 0.8781233429908752, + "learning_rate": 0.0005, + "loss": 1.1174, + "step": 9108 + }, + { + "epoch": 7.27555910543131, + "grad_norm": 0.356841117143631, + "learning_rate": 0.0005, + "loss": 1.108, + "step": 9109 + }, + { + "epoch": 7.276357827476039, + "grad_norm": 0.41136255860328674, + "learning_rate": 0.0005, + "loss": 1.0971, + "step": 9110 + }, + { + "epoch": 7.277156549520766, + "grad_norm": 0.30638960003852844, + "learning_rate": 0.0005, + "loss": 1.1006, + "step": 9111 + }, + { + "epoch": 7.277955271565495, + "grad_norm": 0.3056134879589081, + "learning_rate": 0.0005, + "loss": 1.0907, + "step": 9112 + }, + { + "epoch": 7.2787539936102235, + "grad_norm": 0.3053964376449585, + "learning_rate": 0.0005, + "loss": 1.0803, + "step": 9113 + }, + { + "epoch": 7.279552715654952, + "grad_norm": 0.2799919843673706, + "learning_rate": 0.0005, + "loss": 1.0906, + "step": 9114 + }, + { + "epoch": 7.2803514376996805, + "grad_norm": 0.19091907143592834, + "learning_rate": 0.0005, + "loss": 1.0945, + "step": 9115 + }, + { + "epoch": 7.281150159744409, + "grad_norm": 0.19973579049110413, + "learning_rate": 0.0005, + "loss": 1.0787, + "step": 9116 + }, + { + "epoch": 7.281948881789138, + "grad_norm": 0.21867726743221283, + "learning_rate": 0.0005, + "loss": 1.0822, + "step": 9117 + }, + { + "epoch": 7.282747603833866, + "grad_norm": 0.10351689904928207, + "learning_rate": 0.0005, + "loss": 1.0755, + "step": 9118 + }, + { + "epoch": 7.283546325878595, + "grad_norm": 0.16956113278865814, + "learning_rate": 0.0005, + "loss": 1.0774, + "step": 9119 + }, + { + "epoch": 7.284345047923322, + "grad_norm": 0.2959003150463104, + "learning_rate": 0.0005, + "loss": 1.0674, + "step": 9120 + }, + { + "epoch": 7.285143769968051, + "grad_norm": 0.18194587528705597, + "learning_rate": 0.0005, + "loss": 1.074, + "step": 9121 + }, + { + "epoch": 7.285942492012779, + "grad_norm": 0.10713140666484833, + "learning_rate": 0.0005, + "loss": 1.0698, + "step": 9122 + }, + { + "epoch": 7.286741214057508, + "grad_norm": 0.2391309142112732, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9123 + }, + { + "epoch": 7.287539936102236, + "grad_norm": 0.25640085339546204, + "learning_rate": 0.0005, + "loss": 1.0739, + "step": 9124 + }, + { + "epoch": 7.288338658146965, + "grad_norm": 0.25697845220565796, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9125 + }, + { + "epoch": 7.289137380191693, + "grad_norm": 0.2679392695426941, + "learning_rate": 0.0005, + "loss": 1.081, + "step": 9126 + }, + { + "epoch": 7.289936102236422, + "grad_norm": 0.3405737280845642, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9127 + }, + { + "epoch": 7.2907348242811505, + "grad_norm": 0.31081417202949524, + "learning_rate": 0.0005, + "loss": 1.0688, + "step": 9128 + }, + { + "epoch": 7.291533546325878, + "grad_norm": 0.15159590542316437, + "learning_rate": 0.0005, + "loss": 1.0771, + "step": 9129 + }, + { + "epoch": 7.292332268370607, + "grad_norm": 1.1609382629394531, + "learning_rate": 0.0005, + "loss": 1.0723, + "step": 9130 + }, + { + "epoch": 7.293130990415335, + "grad_norm": 0.5588571429252625, + "learning_rate": 0.0005, + "loss": 1.0884, + "step": 9131 + }, + { + "epoch": 7.293929712460064, + "grad_norm": 0.47076234221458435, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9132 + }, + { + "epoch": 7.294728434504792, + "grad_norm": 1.184756875038147, + "learning_rate": 0.0005, + "loss": 1.0833, + "step": 9133 + }, + { + "epoch": 7.295527156549521, + "grad_norm": 0.40956422686576843, + "learning_rate": 0.0005, + "loss": 1.0789, + "step": 9134 + }, + { + "epoch": 7.296325878594249, + "grad_norm": 0.8017024397850037, + "learning_rate": 0.0005, + "loss": 1.1051, + "step": 9135 + }, + { + "epoch": 7.297124600638978, + "grad_norm": 0.29993146657943726, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9136 + }, + { + "epoch": 7.297923322683706, + "grad_norm": 0.4549245238304138, + "learning_rate": 0.0005, + "loss": 1.0896, + "step": 9137 + }, + { + "epoch": 7.298722044728435, + "grad_norm": 0.26366063952445984, + "learning_rate": 0.0005, + "loss": 1.0772, + "step": 9138 + }, + { + "epoch": 7.2995207667731625, + "grad_norm": 0.3126361668109894, + "learning_rate": 0.0005, + "loss": 1.0717, + "step": 9139 + }, + { + "epoch": 7.300319488817891, + "grad_norm": 0.18184784054756165, + "learning_rate": 0.0005, + "loss": 1.0754, + "step": 9140 + }, + { + "epoch": 7.30111821086262, + "grad_norm": 0.91683429479599, + "learning_rate": 0.0005, + "loss": 1.0802, + "step": 9141 + }, + { + "epoch": 7.301916932907348, + "grad_norm": 3.3384642601013184, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9142 + }, + { + "epoch": 7.302715654952077, + "grad_norm": 0.21734145283699036, + "learning_rate": 0.0005, + "loss": 1.0768, + "step": 9143 + }, + { + "epoch": 7.303514376996805, + "grad_norm": 0.13850291073322296, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9144 + }, + { + "epoch": 7.304313099041534, + "grad_norm": 0.1737629920244217, + "learning_rate": 0.0005, + "loss": 1.0782, + "step": 9145 + }, + { + "epoch": 7.305111821086262, + "grad_norm": 0.3947316110134125, + "learning_rate": 0.0005, + "loss": 1.0794, + "step": 9146 + }, + { + "epoch": 7.305910543130991, + "grad_norm": 0.16360799968242645, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9147 + }, + { + "epoch": 7.306709265175719, + "grad_norm": 0.14816711843013763, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9148 + }, + { + "epoch": 7.307507987220447, + "grad_norm": 0.13554179668426514, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9149 + }, + { + "epoch": 7.3083067092651754, + "grad_norm": 0.10308978706598282, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9150 + }, + { + "epoch": 7.309105431309904, + "grad_norm": 0.11216582357883453, + "learning_rate": 0.0005, + "loss": 1.0678, + "step": 9151 + }, + { + "epoch": 7.3099041533546325, + "grad_norm": 0.08531700819730759, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9152 + }, + { + "epoch": 7.310702875399361, + "grad_norm": 0.10261841118335724, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9153 + }, + { + "epoch": 7.31150159744409, + "grad_norm": 0.18318074941635132, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9154 + }, + { + "epoch": 7.312300319488818, + "grad_norm": 0.1616939902305603, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9155 + }, + { + "epoch": 7.313099041533547, + "grad_norm": 0.10412739217281342, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9156 + }, + { + "epoch": 7.313897763578275, + "grad_norm": 0.14097075164318085, + "learning_rate": 0.0005, + "loss": 1.0681, + "step": 9157 + }, + { + "epoch": 7.314696485623003, + "grad_norm": 0.2168329358100891, + "learning_rate": 0.0005, + "loss": 1.0689, + "step": 9158 + }, + { + "epoch": 7.315495207667731, + "grad_norm": 0.14337286353111267, + "learning_rate": 0.0005, + "loss": 1.0707, + "step": 9159 + }, + { + "epoch": 7.31629392971246, + "grad_norm": 0.10328586399555206, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9160 + }, + { + "epoch": 7.317092651757188, + "grad_norm": 0.15820610523223877, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9161 + }, + { + "epoch": 7.317891373801917, + "grad_norm": 0.11771009862422943, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9162 + }, + { + "epoch": 7.318690095846645, + "grad_norm": 0.06801208108663559, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9163 + }, + { + "epoch": 7.319488817891374, + "grad_norm": 0.08691044896841049, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9164 + }, + { + "epoch": 7.3202875399361025, + "grad_norm": 0.10149878263473511, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9165 + }, + { + "epoch": 7.321086261980831, + "grad_norm": 0.08544973284006119, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9166 + }, + { + "epoch": 7.321884984025559, + "grad_norm": 0.21312831342220306, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9167 + }, + { + "epoch": 7.322683706070287, + "grad_norm": 0.09866507351398468, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9168 + }, + { + "epoch": 7.323482428115016, + "grad_norm": 0.09676753729581833, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9169 + }, + { + "epoch": 7.324281150159744, + "grad_norm": 0.1783452033996582, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9170 + }, + { + "epoch": 7.325079872204473, + "grad_norm": 0.16399280726909637, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9171 + }, + { + "epoch": 7.325878594249201, + "grad_norm": 0.1160425990819931, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9172 + }, + { + "epoch": 7.32667731629393, + "grad_norm": 0.09826952964067459, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9173 + }, + { + "epoch": 7.327476038338658, + "grad_norm": 0.1292516440153122, + "learning_rate": 0.0005, + "loss": 1.0516, + "step": 9174 + }, + { + "epoch": 7.328274760383387, + "grad_norm": 0.1253383606672287, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9175 + }, + { + "epoch": 7.329073482428115, + "grad_norm": 0.15330855548381805, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9176 + }, + { + "epoch": 7.329872204472843, + "grad_norm": 0.16339725255966187, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9177 + }, + { + "epoch": 7.330670926517572, + "grad_norm": 0.1716328263282776, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9178 + }, + { + "epoch": 7.3314696485623, + "grad_norm": 0.07669667154550552, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9179 + }, + { + "epoch": 7.332268370607029, + "grad_norm": 0.06626272946596146, + "learning_rate": 0.0005, + "loss": 1.0682, + "step": 9180 + }, + { + "epoch": 7.333067092651757, + "grad_norm": 0.0935940146446228, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9181 + }, + { + "epoch": 7.333865814696486, + "grad_norm": 0.07840511202812195, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9182 + }, + { + "epoch": 7.334664536741214, + "grad_norm": 0.07776588946580887, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9183 + }, + { + "epoch": 7.335463258785943, + "grad_norm": 0.084624283015728, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9184 + }, + { + "epoch": 7.336261980830671, + "grad_norm": 0.07562167197465897, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9185 + }, + { + "epoch": 7.3370607028754, + "grad_norm": 0.08628194034099579, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9186 + }, + { + "epoch": 7.337859424920127, + "grad_norm": 0.0654950812458992, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9187 + }, + { + "epoch": 7.338658146964856, + "grad_norm": 0.06403883546590805, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9188 + }, + { + "epoch": 7.3394568690095845, + "grad_norm": 0.8679103851318359, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9189 + }, + { + "epoch": 7.340255591054313, + "grad_norm": 0.42257770895957947, + "learning_rate": 0.0005, + "loss": 1.0784, + "step": 9190 + }, + { + "epoch": 7.3410543130990416, + "grad_norm": 0.3017493486404419, + "learning_rate": 0.0005, + "loss": 1.0758, + "step": 9191 + }, + { + "epoch": 7.34185303514377, + "grad_norm": 0.30509164929389954, + "learning_rate": 0.0005, + "loss": 1.0725, + "step": 9192 + }, + { + "epoch": 7.342651757188499, + "grad_norm": 0.28457221388816833, + "learning_rate": 0.0005, + "loss": 1.0769, + "step": 9193 + }, + { + "epoch": 7.343450479233227, + "grad_norm": 0.2734214961528778, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9194 + }, + { + "epoch": 7.344249201277956, + "grad_norm": 0.2931375801563263, + "learning_rate": 0.0005, + "loss": 1.0715, + "step": 9195 + }, + { + "epoch": 7.345047923322683, + "grad_norm": 0.11534975469112396, + "learning_rate": 0.0005, + "loss": 1.0691, + "step": 9196 + }, + { + "epoch": 7.345846645367412, + "grad_norm": 0.1489555388689041, + "learning_rate": 0.0005, + "loss": 1.0705, + "step": 9197 + }, + { + "epoch": 7.34664536741214, + "grad_norm": 0.13024470210075378, + "learning_rate": 0.0005, + "loss": 1.0726, + "step": 9198 + }, + { + "epoch": 7.347444089456869, + "grad_norm": 0.1413331776857376, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9199 + }, + { + "epoch": 7.348242811501597, + "grad_norm": 0.07862340658903122, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9200 + }, + { + "epoch": 7.349041533546326, + "grad_norm": 0.0870542973279953, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9201 + }, + { + "epoch": 7.3498402555910545, + "grad_norm": 0.07556174695491791, + "learning_rate": 0.0005, + "loss": 1.0659, + "step": 9202 + }, + { + "epoch": 7.350638977635783, + "grad_norm": 0.07381146401166916, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9203 + }, + { + "epoch": 7.3514376996805115, + "grad_norm": 0.5006929636001587, + "learning_rate": 0.0005, + "loss": 1.0672, + "step": 9204 + }, + { + "epoch": 7.352236421725239, + "grad_norm": 0.2980809807777405, + "learning_rate": 0.0005, + "loss": 1.0733, + "step": 9205 + }, + { + "epoch": 7.353035143769968, + "grad_norm": 0.20632435381412506, + "learning_rate": 0.0005, + "loss": 1.0701, + "step": 9206 + }, + { + "epoch": 7.353833865814696, + "grad_norm": 0.2028435915708542, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9207 + }, + { + "epoch": 7.354632587859425, + "grad_norm": 0.220264732837677, + "learning_rate": 0.0005, + "loss": 1.0664, + "step": 9208 + }, + { + "epoch": 7.355431309904153, + "grad_norm": 0.07175029814243317, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9209 + }, + { + "epoch": 7.356230031948882, + "grad_norm": 0.20052626729011536, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9210 + }, + { + "epoch": 7.35702875399361, + "grad_norm": 0.3549690544605255, + "learning_rate": 0.0005, + "loss": 1.0697, + "step": 9211 + }, + { + "epoch": 7.357827476038339, + "grad_norm": 0.1310572475194931, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9212 + }, + { + "epoch": 7.358626198083067, + "grad_norm": 0.9551740288734436, + "learning_rate": 0.0005, + "loss": 1.0747, + "step": 9213 + }, + { + "epoch": 7.359424920127796, + "grad_norm": 0.13663409650325775, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9214 + }, + { + "epoch": 7.360223642172524, + "grad_norm": 0.11436715722084045, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9215 + }, + { + "epoch": 7.361022364217252, + "grad_norm": 0.10911283642053604, + "learning_rate": 0.0005, + "loss": 1.0687, + "step": 9216 + }, + { + "epoch": 7.361821086261981, + "grad_norm": 0.11186671257019043, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9217 + }, + { + "epoch": 7.362619808306709, + "grad_norm": 0.1308698207139969, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9218 + }, + { + "epoch": 7.363418530351438, + "grad_norm": 0.07584013044834137, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9219 + }, + { + "epoch": 7.364217252396166, + "grad_norm": 0.07789483666419983, + "learning_rate": 0.0005, + "loss": 1.0696, + "step": 9220 + }, + { + "epoch": 7.365015974440895, + "grad_norm": 0.12758736312389374, + "learning_rate": 0.0005, + "loss": 1.0651, + "step": 9221 + }, + { + "epoch": 7.365814696485623, + "grad_norm": 0.09310994297266006, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9222 + }, + { + "epoch": 7.366613418530352, + "grad_norm": 0.14761847257614136, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9223 + }, + { + "epoch": 7.36741214057508, + "grad_norm": 0.8784921169281006, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9224 + }, + { + "epoch": 7.368210862619808, + "grad_norm": 0.07754036784172058, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9225 + }, + { + "epoch": 7.3690095846645365, + "grad_norm": 0.06706640869379044, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9226 + }, + { + "epoch": 7.369808306709265, + "grad_norm": 0.0949360579252243, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9227 + }, + { + "epoch": 7.3706070287539935, + "grad_norm": 0.09635552763938904, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9228 + }, + { + "epoch": 7.371405750798722, + "grad_norm": 0.15888135135173798, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9229 + }, + { + "epoch": 7.372204472843451, + "grad_norm": 0.1487814337015152, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9230 + }, + { + "epoch": 7.373003194888179, + "grad_norm": 0.09755469113588333, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9231 + }, + { + "epoch": 7.373801916932908, + "grad_norm": 0.2550356984138489, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9232 + }, + { + "epoch": 7.374600638977636, + "grad_norm": 0.13796621561050415, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9233 + }, + { + "epoch": 7.375399361022364, + "grad_norm": 0.06727192550897598, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9234 + }, + { + "epoch": 7.376198083067092, + "grad_norm": 0.09111928194761276, + "learning_rate": 0.0005, + "loss": 1.0669, + "step": 9235 + }, + { + "epoch": 7.376996805111821, + "grad_norm": 0.15708492696285248, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9236 + }, + { + "epoch": 7.377795527156549, + "grad_norm": 0.06607159227132797, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9237 + }, + { + "epoch": 7.378594249201278, + "grad_norm": 0.3495469391345978, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9238 + }, + { + "epoch": 7.3793929712460065, + "grad_norm": 0.249598890542984, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9239 + }, + { + "epoch": 7.380191693290735, + "grad_norm": 0.1506706029176712, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9240 + }, + { + "epoch": 7.3809904153354635, + "grad_norm": 0.2053573578596115, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9241 + }, + { + "epoch": 7.381789137380192, + "grad_norm": 0.20234468579292297, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9242 + }, + { + "epoch": 7.38258785942492, + "grad_norm": 0.23514828085899353, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9243 + }, + { + "epoch": 7.383386581469648, + "grad_norm": 0.13418453931808472, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9244 + }, + { + "epoch": 7.384185303514377, + "grad_norm": 0.07703951746225357, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9245 + }, + { + "epoch": 7.384984025559105, + "grad_norm": 0.20256030559539795, + "learning_rate": 0.0005, + "loss": 1.064, + "step": 9246 + }, + { + "epoch": 7.385782747603834, + "grad_norm": 0.1140165850520134, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9247 + }, + { + "epoch": 7.386581469648562, + "grad_norm": 0.6283542513847351, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9248 + }, + { + "epoch": 7.387380191693291, + "grad_norm": 0.11779789626598358, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9249 + }, + { + "epoch": 7.388178913738019, + "grad_norm": 0.09821031987667084, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9250 + }, + { + "epoch": 7.388977635782748, + "grad_norm": 0.10942906141281128, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9251 + }, + { + "epoch": 7.389776357827476, + "grad_norm": 0.6150240302085876, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9252 + }, + { + "epoch": 7.390575079872204, + "grad_norm": 0.17758208513259888, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9253 + }, + { + "epoch": 7.391373801916933, + "grad_norm": 0.09567593038082123, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9254 + }, + { + "epoch": 7.392172523961661, + "grad_norm": 0.1177724078297615, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9255 + }, + { + "epoch": 7.39297124600639, + "grad_norm": 0.12369771301746368, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9256 + }, + { + "epoch": 7.393769968051118, + "grad_norm": 0.11247415840625763, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9257 + }, + { + "epoch": 7.394568690095847, + "grad_norm": 0.15094342827796936, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9258 + }, + { + "epoch": 7.395367412140575, + "grad_norm": 0.113029845058918, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9259 + }, + { + "epoch": 7.396166134185304, + "grad_norm": 0.1620573252439499, + "learning_rate": 0.0005, + "loss": 1.0654, + "step": 9260 + }, + { + "epoch": 7.396964856230032, + "grad_norm": 0.10010898113250732, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9261 + }, + { + "epoch": 7.397763578274761, + "grad_norm": 0.21061348915100098, + "learning_rate": 0.0005, + "loss": 1.0644, + "step": 9262 + }, + { + "epoch": 7.3985623003194885, + "grad_norm": 0.06199006363749504, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9263 + }, + { + "epoch": 7.399361022364217, + "grad_norm": 0.09612002968788147, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9264 + }, + { + "epoch": 7.4001597444089455, + "grad_norm": 0.13255780935287476, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9265 + }, + { + "epoch": 7.400958466453674, + "grad_norm": 0.22877056896686554, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9266 + }, + { + "epoch": 7.401757188498403, + "grad_norm": 0.18957512080669403, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9267 + }, + { + "epoch": 7.402555910543131, + "grad_norm": 0.211961030960083, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9268 + }, + { + "epoch": 7.40335463258786, + "grad_norm": 0.07744339853525162, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9269 + }, + { + "epoch": 7.404153354632588, + "grad_norm": 0.19085711240768433, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9270 + }, + { + "epoch": 7.404952076677317, + "grad_norm": 0.13099227845668793, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9271 + }, + { + "epoch": 7.405750798722044, + "grad_norm": 0.24543818831443787, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9272 + }, + { + "epoch": 7.406549520766773, + "grad_norm": 0.18623757362365723, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9273 + }, + { + "epoch": 7.407348242811501, + "grad_norm": 0.06898430734872818, + "learning_rate": 0.0005, + "loss": 1.0617, + "step": 9274 + }, + { + "epoch": 7.40814696485623, + "grad_norm": 0.1809006780385971, + "learning_rate": 0.0005, + "loss": 1.0597, + "step": 9275 + }, + { + "epoch": 7.4089456869009584, + "grad_norm": 0.11338596791028976, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9276 + }, + { + "epoch": 7.409744408945687, + "grad_norm": 0.10182031989097595, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9277 + }, + { + "epoch": 7.4105431309904155, + "grad_norm": 0.1521865278482437, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9278 + }, + { + "epoch": 7.411341853035144, + "grad_norm": 0.08848808705806732, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9279 + }, + { + "epoch": 7.412140575079873, + "grad_norm": 0.10398431867361069, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9280 + }, + { + "epoch": 7.4129392971246, + "grad_norm": 0.10145912319421768, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9281 + }, + { + "epoch": 7.413738019169329, + "grad_norm": 0.12386789917945862, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9282 + }, + { + "epoch": 7.414536741214057, + "grad_norm": 0.09763981401920319, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9283 + }, + { + "epoch": 7.415335463258786, + "grad_norm": 0.08810468763113022, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9284 + }, + { + "epoch": 7.416134185303514, + "grad_norm": 0.06196752190589905, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9285 + }, + { + "epoch": 7.416932907348243, + "grad_norm": 1.4297560453414917, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9286 + }, + { + "epoch": 7.417731629392971, + "grad_norm": 0.07783587276935577, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9287 + }, + { + "epoch": 7.4185303514377, + "grad_norm": 0.3592485189437866, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9288 + }, + { + "epoch": 7.419329073482428, + "grad_norm": 0.10796934366226196, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9289 + }, + { + "epoch": 7.420127795527157, + "grad_norm": 0.11450864374637604, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9290 + }, + { + "epoch": 7.420926517571885, + "grad_norm": 0.06718776375055313, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9291 + }, + { + "epoch": 7.421725239616613, + "grad_norm": 0.1776629537343979, + "learning_rate": 0.0005, + "loss": 1.0551, + "step": 9292 + }, + { + "epoch": 7.422523961661342, + "grad_norm": 0.058177318423986435, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9293 + }, + { + "epoch": 7.42332268370607, + "grad_norm": 0.08145572990179062, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9294 + }, + { + "epoch": 7.424121405750799, + "grad_norm": 0.07605774700641632, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9295 + }, + { + "epoch": 7.424920127795527, + "grad_norm": 0.5453565120697021, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9296 + }, + { + "epoch": 7.425718849840256, + "grad_norm": 0.08215200155973434, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9297 + }, + { + "epoch": 7.426517571884984, + "grad_norm": 0.06014016270637512, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9298 + }, + { + "epoch": 7.427316293929713, + "grad_norm": 0.11043576151132584, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9299 + }, + { + "epoch": 7.428115015974441, + "grad_norm": 0.1421220898628235, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9300 + }, + { + "epoch": 7.428913738019169, + "grad_norm": 0.10473544150590897, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9301 + }, + { + "epoch": 7.4297124600638975, + "grad_norm": 0.09921323508024216, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9302 + }, + { + "epoch": 7.430511182108626, + "grad_norm": 0.07775744050741196, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9303 + }, + { + "epoch": 7.431309904153355, + "grad_norm": 0.3015517294406891, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9304 + }, + { + "epoch": 7.432108626198083, + "grad_norm": 0.06826018542051315, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9305 + }, + { + "epoch": 7.432907348242812, + "grad_norm": 0.06002574786543846, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9306 + }, + { + "epoch": 7.43370607028754, + "grad_norm": 0.07082310318946838, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9307 + }, + { + "epoch": 7.434504792332269, + "grad_norm": 0.1356203258037567, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9308 + }, + { + "epoch": 7.435303514376997, + "grad_norm": 0.09689080715179443, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9309 + }, + { + "epoch": 7.436102236421725, + "grad_norm": 0.0938429981470108, + "learning_rate": 0.0005, + "loss": 1.0443, + "step": 9310 + }, + { + "epoch": 7.436900958466453, + "grad_norm": 0.0853746086359024, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9311 + }, + { + "epoch": 7.437699680511182, + "grad_norm": 0.09427982568740845, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9312 + }, + { + "epoch": 7.43849840255591, + "grad_norm": 0.14042942225933075, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9313 + }, + { + "epoch": 7.439297124600639, + "grad_norm": 0.4248291552066803, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9314 + }, + { + "epoch": 7.4400958466453675, + "grad_norm": 0.18214350938796997, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9315 + }, + { + "epoch": 7.440894568690096, + "grad_norm": 0.2564402222633362, + "learning_rate": 0.0005, + "loss": 1.0488, + "step": 9316 + }, + { + "epoch": 7.4416932907348246, + "grad_norm": 0.10012423992156982, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9317 + }, + { + "epoch": 7.442492012779553, + "grad_norm": 0.15337461233139038, + "learning_rate": 0.0005, + "loss": 1.0653, + "step": 9318 + }, + { + "epoch": 7.443290734824281, + "grad_norm": 0.1396649181842804, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9319 + }, + { + "epoch": 7.444089456869009, + "grad_norm": 0.12310001254081726, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9320 + }, + { + "epoch": 7.444888178913738, + "grad_norm": 0.12932278215885162, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9321 + }, + { + "epoch": 7.445686900958466, + "grad_norm": 0.12403959035873413, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9322 + }, + { + "epoch": 7.446485623003195, + "grad_norm": 0.4164578318595886, + "learning_rate": 0.0005, + "loss": 1.0585, + "step": 9323 + }, + { + "epoch": 7.447284345047923, + "grad_norm": 0.2015235871076584, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 9324 + }, + { + "epoch": 7.448083067092652, + "grad_norm": 0.2619101107120514, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9325 + }, + { + "epoch": 7.44888178913738, + "grad_norm": 0.07511210441589355, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9326 + }, + { + "epoch": 7.449680511182109, + "grad_norm": 7.956277370452881, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9327 + }, + { + "epoch": 7.4504792332268375, + "grad_norm": 0.23822273313999176, + "learning_rate": 0.0005, + "loss": 1.0718, + "step": 9328 + }, + { + "epoch": 7.451277955271565, + "grad_norm": 0.1565391719341278, + "learning_rate": 0.0005, + "loss": 1.078, + "step": 9329 + }, + { + "epoch": 7.452076677316294, + "grad_norm": 0.15820777416229248, + "learning_rate": 0.0005, + "loss": 1.0734, + "step": 9330 + }, + { + "epoch": 7.452875399361022, + "grad_norm": 0.16341058909893036, + "learning_rate": 0.0005, + "loss": 1.0745, + "step": 9331 + }, + { + "epoch": 7.453674121405751, + "grad_norm": 0.19414658844470978, + "learning_rate": 0.0005, + "loss": 1.0748, + "step": 9332 + }, + { + "epoch": 7.454472843450479, + "grad_norm": 0.18798880279064178, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9333 + }, + { + "epoch": 7.455271565495208, + "grad_norm": 0.09032963961362839, + "learning_rate": 0.0005, + "loss": 1.0645, + "step": 9334 + }, + { + "epoch": 7.456070287539936, + "grad_norm": 0.12746790051460266, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9335 + }, + { + "epoch": 7.456869009584665, + "grad_norm": 0.34985360503196716, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9336 + }, + { + "epoch": 7.457667731629393, + "grad_norm": 0.22745627164840698, + "learning_rate": 0.0005, + "loss": 1.0695, + "step": 9337 + }, + { + "epoch": 7.458466453674122, + "grad_norm": 1.297531247138977, + "learning_rate": 0.0005, + "loss": 1.0706, + "step": 9338 + }, + { + "epoch": 7.4592651757188495, + "grad_norm": 0.3254985809326172, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9339 + }, + { + "epoch": 7.460063897763578, + "grad_norm": 0.28899863362312317, + "learning_rate": 0.0005, + "loss": 1.0713, + "step": 9340 + }, + { + "epoch": 7.460862619808307, + "grad_norm": 0.09964017570018768, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9341 + }, + { + "epoch": 7.461661341853035, + "grad_norm": 0.2713227868080139, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9342 + }, + { + "epoch": 7.462460063897764, + "grad_norm": 0.16604198515415192, + "learning_rate": 0.0005, + "loss": 1.0684, + "step": 9343 + }, + { + "epoch": 7.463258785942492, + "grad_norm": 0.12053536623716354, + "learning_rate": 0.0005, + "loss": 1.0759, + "step": 9344 + }, + { + "epoch": 7.464057507987221, + "grad_norm": 0.20081757009029388, + "learning_rate": 0.0005, + "loss": 1.0662, + "step": 9345 + }, + { + "epoch": 7.464856230031949, + "grad_norm": 0.14005789160728455, + "learning_rate": 0.0005, + "loss": 1.066, + "step": 9346 + }, + { + "epoch": 7.465654952076678, + "grad_norm": 0.15481705963611603, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9347 + }, + { + "epoch": 7.466453674121405, + "grad_norm": 0.1843721717596054, + "learning_rate": 0.0005, + "loss": 1.0612, + "step": 9348 + }, + { + "epoch": 7.467252396166134, + "grad_norm": 0.11873828619718552, + "learning_rate": 0.0005, + "loss": 1.0648, + "step": 9349 + }, + { + "epoch": 7.468051118210862, + "grad_norm": 0.199008509516716, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9350 + }, + { + "epoch": 7.468849840255591, + "grad_norm": 0.10533998161554337, + "learning_rate": 0.0005, + "loss": 1.0649, + "step": 9351 + }, + { + "epoch": 7.4696485623003195, + "grad_norm": 0.4823262691497803, + "learning_rate": 0.0005, + "loss": 1.0663, + "step": 9352 + }, + { + "epoch": 7.470447284345048, + "grad_norm": 0.25044289231300354, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9353 + }, + { + "epoch": 7.4712460063897765, + "grad_norm": 0.11273030936717987, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9354 + }, + { + "epoch": 7.472044728434505, + "grad_norm": 0.15552200376987457, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9355 + }, + { + "epoch": 7.472843450479234, + "grad_norm": 0.2211492508649826, + "learning_rate": 0.0005, + "loss": 1.067, + "step": 9356 + }, + { + "epoch": 7.473642172523961, + "grad_norm": 0.38023853302001953, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9357 + }, + { + "epoch": 7.47444089456869, + "grad_norm": 0.15553027391433716, + "learning_rate": 0.0005, + "loss": 1.0636, + "step": 9358 + }, + { + "epoch": 7.475239616613418, + "grad_norm": 0.11964324861764908, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9359 + }, + { + "epoch": 7.476038338658147, + "grad_norm": 0.06454652547836304, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9360 + }, + { + "epoch": 7.476837060702875, + "grad_norm": 0.090255506336689, + "learning_rate": 0.0005, + "loss": 1.063, + "step": 9361 + }, + { + "epoch": 7.477635782747604, + "grad_norm": 0.07100088149309158, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9362 + }, + { + "epoch": 7.478434504792332, + "grad_norm": 0.14697550237178802, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9363 + }, + { + "epoch": 7.479233226837061, + "grad_norm": 0.14088693261146545, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9364 + }, + { + "epoch": 7.4800319488817895, + "grad_norm": 0.12696029245853424, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9365 + }, + { + "epoch": 7.480830670926517, + "grad_norm": 0.15335378050804138, + "learning_rate": 0.0005, + "loss": 1.0656, + "step": 9366 + }, + { + "epoch": 7.481629392971246, + "grad_norm": 0.10186830163002014, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9367 + }, + { + "epoch": 7.482428115015974, + "grad_norm": 0.11318683624267578, + "learning_rate": 0.0005, + "loss": 1.0531, + "step": 9368 + }, + { + "epoch": 7.483226837060703, + "grad_norm": 0.1290084272623062, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9369 + }, + { + "epoch": 7.484025559105431, + "grad_norm": 0.160775288939476, + "learning_rate": 0.0005, + "loss": 1.0655, + "step": 9370 + }, + { + "epoch": 7.48482428115016, + "grad_norm": 0.1998366117477417, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9371 + }, + { + "epoch": 7.485623003194888, + "grad_norm": 0.15808500349521637, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9372 + }, + { + "epoch": 7.486421725239617, + "grad_norm": 0.15403985977172852, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9373 + }, + { + "epoch": 7.487220447284345, + "grad_norm": 0.11963094770908356, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9374 + }, + { + "epoch": 7.488019169329074, + "grad_norm": 0.058245617896318436, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9375 + }, + { + "epoch": 7.488817891373802, + "grad_norm": 0.1256275773048401, + "learning_rate": 0.0005, + "loss": 1.0603, + "step": 9376 + }, + { + "epoch": 7.48961661341853, + "grad_norm": 0.09230747818946838, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9377 + }, + { + "epoch": 7.4904153354632586, + "grad_norm": 0.15109197795391083, + "learning_rate": 0.0005, + "loss": 1.0638, + "step": 9378 + }, + { + "epoch": 7.491214057507987, + "grad_norm": 0.20005039870738983, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9379 + }, + { + "epoch": 7.492012779552716, + "grad_norm": 0.08591387420892715, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9380 + }, + { + "epoch": 7.492811501597444, + "grad_norm": 0.07975071668624878, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9381 + }, + { + "epoch": 7.493610223642173, + "grad_norm": 0.1258707046508789, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9382 + }, + { + "epoch": 7.494408945686901, + "grad_norm": 0.16978499293327332, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9383 + }, + { + "epoch": 7.49520766773163, + "grad_norm": 0.09052985906600952, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9384 + }, + { + "epoch": 7.496006389776358, + "grad_norm": 0.15344351530075073, + "learning_rate": 0.0005, + "loss": 1.0489, + "step": 9385 + }, + { + "epoch": 7.496805111821086, + "grad_norm": 0.04684900864958763, + "learning_rate": 0.0005, + "loss": 1.0592, + "step": 9386 + }, + { + "epoch": 7.497603833865814, + "grad_norm": 0.09235356748104095, + "learning_rate": 0.0005, + "loss": 1.0602, + "step": 9387 + }, + { + "epoch": 7.498402555910543, + "grad_norm": 0.0924983024597168, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9388 + }, + { + "epoch": 7.4992012779552715, + "grad_norm": 0.12623359262943268, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9389 + }, + { + "epoch": 7.5, + "grad_norm": 0.08572034537792206, + "learning_rate": 0.0005, + "loss": 1.056, + "step": 9390 + }, + { + "epoch": 7.5007987220447285, + "grad_norm": 0.12267094850540161, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9391 + }, + { + "epoch": 7.501597444089457, + "grad_norm": 0.20448675751686096, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9392 + }, + { + "epoch": 7.502396166134186, + "grad_norm": 0.21579930186271667, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9393 + }, + { + "epoch": 7.503194888178914, + "grad_norm": 0.22682903707027435, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9394 + }, + { + "epoch": 7.503993610223642, + "grad_norm": 0.08659582585096359, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 9395 + }, + { + "epoch": 7.50479233226837, + "grad_norm": 0.2064916491508484, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9396 + }, + { + "epoch": 7.505591054313099, + "grad_norm": 0.2137736678123474, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9397 + }, + { + "epoch": 7.506389776357827, + "grad_norm": 0.10891635715961456, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9398 + }, + { + "epoch": 7.507188498402556, + "grad_norm": 0.23018239438533783, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9399 + }, + { + "epoch": 7.507987220447284, + "grad_norm": 0.2091149538755417, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9400 + }, + { + "epoch": 7.508785942492013, + "grad_norm": 0.11136184632778168, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9401 + }, + { + "epoch": 7.5095846645367414, + "grad_norm": 0.1327456831932068, + "learning_rate": 0.0005, + "loss": 1.0491, + "step": 9402 + }, + { + "epoch": 7.51038338658147, + "grad_norm": 0.08780363947153091, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9403 + }, + { + "epoch": 7.511182108626198, + "grad_norm": 0.14448396861553192, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9404 + }, + { + "epoch": 7.511980830670926, + "grad_norm": 0.12194132804870605, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9405 + }, + { + "epoch": 7.512779552715655, + "grad_norm": 0.09898994117975235, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9406 + }, + { + "epoch": 7.513578274760383, + "grad_norm": 0.0753403753042221, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9407 + }, + { + "epoch": 7.514376996805112, + "grad_norm": 0.1947120577096939, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9408 + }, + { + "epoch": 7.51517571884984, + "grad_norm": 0.10827653110027313, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9409 + }, + { + "epoch": 7.515974440894569, + "grad_norm": 0.06353825330734253, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9410 + }, + { + "epoch": 7.516773162939297, + "grad_norm": 0.16961680352687836, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9411 + }, + { + "epoch": 7.517571884984026, + "grad_norm": 0.09001661092042923, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9412 + }, + { + "epoch": 7.518370607028754, + "grad_norm": 0.07342718541622162, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9413 + }, + { + "epoch": 7.519169329073483, + "grad_norm": 0.1001489907503128, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9414 + }, + { + "epoch": 7.5199680511182105, + "grad_norm": 0.10038813948631287, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9415 + }, + { + "epoch": 7.520766773162939, + "grad_norm": 0.17261064052581787, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9416 + }, + { + "epoch": 7.521565495207668, + "grad_norm": 0.10589580982923508, + "learning_rate": 0.0005, + "loss": 1.0575, + "step": 9417 + }, + { + "epoch": 7.522364217252396, + "grad_norm": 0.055702172219753265, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9418 + }, + { + "epoch": 7.523162939297125, + "grad_norm": 0.122915118932724, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9419 + }, + { + "epoch": 7.523961661341853, + "grad_norm": 0.07361354678869247, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9420 + }, + { + "epoch": 7.524760383386582, + "grad_norm": 0.11187693476676941, + "learning_rate": 0.0005, + "loss": 1.053, + "step": 9421 + }, + { + "epoch": 7.52555910543131, + "grad_norm": 0.06205413118004799, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9422 + }, + { + "epoch": 7.526357827476039, + "grad_norm": 0.07805868983268738, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9423 + }, + { + "epoch": 7.527156549520766, + "grad_norm": 0.14349821209907532, + "learning_rate": 0.0005, + "loss": 1.0595, + "step": 9424 + }, + { + "epoch": 7.527955271565495, + "grad_norm": 0.08928489685058594, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9425 + }, + { + "epoch": 7.5287539936102235, + "grad_norm": 0.10026145726442337, + "learning_rate": 0.0005, + "loss": 1.0464, + "step": 9426 + }, + { + "epoch": 7.529552715654952, + "grad_norm": 0.10531286150217056, + "learning_rate": 0.0005, + "loss": 1.0661, + "step": 9427 + }, + { + "epoch": 7.5303514376996805, + "grad_norm": 0.15984703600406647, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9428 + }, + { + "epoch": 7.531150159744409, + "grad_norm": 0.2948785126209259, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9429 + }, + { + "epoch": 7.531948881789138, + "grad_norm": 0.08823632448911667, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9430 + }, + { + "epoch": 7.532747603833866, + "grad_norm": 0.23016497492790222, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9431 + }, + { + "epoch": 7.533546325878595, + "grad_norm": 0.08874809741973877, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9432 + }, + { + "epoch": 7.534345047923322, + "grad_norm": 0.09074181318283081, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9433 + }, + { + "epoch": 7.535143769968051, + "grad_norm": 0.15151673555374146, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9434 + }, + { + "epoch": 7.535942492012779, + "grad_norm": 0.12276771664619446, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9435 + }, + { + "epoch": 7.536741214057508, + "grad_norm": 0.13978977501392365, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9436 + }, + { + "epoch": 7.537539936102236, + "grad_norm": 0.16208869218826294, + "learning_rate": 0.0005, + "loss": 1.0477, + "step": 9437 + }, + { + "epoch": 7.538338658146965, + "grad_norm": 0.16932648420333862, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9438 + }, + { + "epoch": 7.539137380191693, + "grad_norm": 0.09139750897884369, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9439 + }, + { + "epoch": 7.539936102236422, + "grad_norm": 0.11264985054731369, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9440 + }, + { + "epoch": 7.5407348242811505, + "grad_norm": 0.13534623384475708, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9441 + }, + { + "epoch": 7.541533546325878, + "grad_norm": 0.16307172179222107, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9442 + }, + { + "epoch": 7.542332268370607, + "grad_norm": 0.09774577617645264, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9443 + }, + { + "epoch": 7.543130990415335, + "grad_norm": 0.1296136975288391, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9444 + }, + { + "epoch": 7.543929712460064, + "grad_norm": 0.08055619895458221, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9445 + }, + { + "epoch": 7.544728434504792, + "grad_norm": 0.2668273448944092, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9446 + }, + { + "epoch": 7.545527156549521, + "grad_norm": 0.1507730782032013, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9447 + }, + { + "epoch": 7.546325878594249, + "grad_norm": 0.17098994553089142, + "learning_rate": 0.0005, + "loss": 1.0486, + "step": 9448 + }, + { + "epoch": 7.547124600638978, + "grad_norm": 0.22425173223018646, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9449 + }, + { + "epoch": 7.547923322683706, + "grad_norm": 0.3074493706226349, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9450 + }, + { + "epoch": 7.548722044728435, + "grad_norm": 0.1917268931865692, + "learning_rate": 0.0005, + "loss": 1.0587, + "step": 9451 + }, + { + "epoch": 7.549520766773163, + "grad_norm": 0.21276478469371796, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9452 + }, + { + "epoch": 7.550319488817891, + "grad_norm": 0.2990981638431549, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9453 + }, + { + "epoch": 7.55111821086262, + "grad_norm": 0.21135985851287842, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9454 + }, + { + "epoch": 7.551916932907348, + "grad_norm": 0.1154661774635315, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9455 + }, + { + "epoch": 7.552715654952077, + "grad_norm": 0.13149744272232056, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9456 + }, + { + "epoch": 7.553514376996805, + "grad_norm": 0.36513134837150574, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9457 + }, + { + "epoch": 7.554313099041534, + "grad_norm": 0.2005227655172348, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9458 + }, + { + "epoch": 7.555111821086262, + "grad_norm": 0.22272491455078125, + "learning_rate": 0.0005, + "loss": 1.0511, + "step": 9459 + }, + { + "epoch": 7.555910543130991, + "grad_norm": 0.05990196391940117, + "learning_rate": 0.0005, + "loss": 1.0622, + "step": 9460 + }, + { + "epoch": 7.556709265175719, + "grad_norm": 0.20874981582164764, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9461 + }, + { + "epoch": 7.557507987220447, + "grad_norm": 0.10478242486715317, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9462 + }, + { + "epoch": 7.5583067092651754, + "grad_norm": 0.2455470710992813, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9463 + }, + { + "epoch": 7.559105431309904, + "grad_norm": 0.31378838419914246, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9464 + }, + { + "epoch": 7.5599041533546325, + "grad_norm": 0.1903901994228363, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9465 + }, + { + "epoch": 7.560702875399361, + "grad_norm": 0.34334853291511536, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9466 + }, + { + "epoch": 7.56150159744409, + "grad_norm": 0.20050539076328278, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9467 + }, + { + "epoch": 7.562300319488818, + "grad_norm": 0.14147023856639862, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9468 + }, + { + "epoch": 7.563099041533547, + "grad_norm": 0.2242746353149414, + "learning_rate": 0.0005, + "loss": 1.0599, + "step": 9469 + }, + { + "epoch": 7.563897763578275, + "grad_norm": 0.10040932893753052, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9470 + }, + { + "epoch": 7.564696485623003, + "grad_norm": 0.2527815103530884, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9471 + }, + { + "epoch": 7.565495207667731, + "grad_norm": 0.1675105094909668, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9472 + }, + { + "epoch": 7.56629392971246, + "grad_norm": 0.23818080127239227, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9473 + }, + { + "epoch": 7.567092651757188, + "grad_norm": 0.31956857442855835, + "learning_rate": 0.0005, + "loss": 1.0647, + "step": 9474 + }, + { + "epoch": 7.567891373801917, + "grad_norm": 0.15272031724452972, + "learning_rate": 0.0005, + "loss": 1.044, + "step": 9475 + }, + { + "epoch": 7.568690095846645, + "grad_norm": 0.20540206134319305, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9476 + }, + { + "epoch": 7.569488817891374, + "grad_norm": 0.2269754856824875, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9477 + }, + { + "epoch": 7.5702875399361025, + "grad_norm": 0.19880101084709167, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9478 + }, + { + "epoch": 7.571086261980831, + "grad_norm": 0.2734098732471466, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9479 + }, + { + "epoch": 7.571884984025559, + "grad_norm": 0.17886638641357422, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9480 + }, + { + "epoch": 7.572683706070287, + "grad_norm": 0.15882767736911774, + "learning_rate": 0.0005, + "loss": 1.0525, + "step": 9481 + }, + { + "epoch": 7.573482428115016, + "grad_norm": 0.18066628277301788, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9482 + }, + { + "epoch": 7.574281150159744, + "grad_norm": 0.1025780662894249, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9483 + }, + { + "epoch": 7.575079872204473, + "grad_norm": 0.09417031705379486, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9484 + }, + { + "epoch": 7.575878594249201, + "grad_norm": 0.26811933517456055, + "learning_rate": 0.0005, + "loss": 1.0632, + "step": 9485 + }, + { + "epoch": 7.57667731629393, + "grad_norm": 0.07128968089818954, + "learning_rate": 0.0005, + "loss": 1.0538, + "step": 9486 + }, + { + "epoch": 7.577476038338658, + "grad_norm": 0.13026759028434753, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9487 + }, + { + "epoch": 7.578274760383387, + "grad_norm": 0.09879457950592041, + "learning_rate": 0.0005, + "loss": 1.0625, + "step": 9488 + }, + { + "epoch": 7.5790734824281145, + "grad_norm": 0.15383538603782654, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9489 + }, + { + "epoch": 7.579872204472844, + "grad_norm": 0.17010194063186646, + "learning_rate": 0.0005, + "loss": 1.0665, + "step": 9490 + }, + { + "epoch": 7.580670926517572, + "grad_norm": 0.09413834661245346, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9491 + }, + { + "epoch": 7.5814696485623, + "grad_norm": 0.13111010193824768, + "learning_rate": 0.0005, + "loss": 1.0452, + "step": 9492 + }, + { + "epoch": 7.582268370607029, + "grad_norm": 0.14170758426189423, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9493 + }, + { + "epoch": 7.583067092651757, + "grad_norm": 0.10549119114875793, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9494 + }, + { + "epoch": 7.583865814696486, + "grad_norm": 0.06767291575670242, + "learning_rate": 0.0005, + "loss": 1.0571, + "step": 9495 + }, + { + "epoch": 7.584664536741214, + "grad_norm": 0.3329547643661499, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9496 + }, + { + "epoch": 7.585463258785943, + "grad_norm": 0.09325312823057175, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9497 + }, + { + "epoch": 7.586261980830671, + "grad_norm": 0.11408714950084686, + "learning_rate": 0.0005, + "loss": 1.0458, + "step": 9498 + }, + { + "epoch": 7.5870607028754, + "grad_norm": 0.10127131640911102, + "learning_rate": 0.0005, + "loss": 1.0481, + "step": 9499 + }, + { + "epoch": 7.587859424920127, + "grad_norm": 0.14656123518943787, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9500 + }, + { + "epoch": 7.588658146964856, + "grad_norm": 0.33641156554222107, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9501 + }, + { + "epoch": 7.5894568690095845, + "grad_norm": 0.09339869022369385, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9502 + }, + { + "epoch": 7.590255591054313, + "grad_norm": 0.10584868490695953, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9503 + }, + { + "epoch": 7.5910543130990416, + "grad_norm": 0.09518138319253922, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9504 + }, + { + "epoch": 7.59185303514377, + "grad_norm": 0.07680968940258026, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9505 + }, + { + "epoch": 7.592651757188499, + "grad_norm": 0.19037210941314697, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9506 + }, + { + "epoch": 7.593450479233227, + "grad_norm": 0.06012401729822159, + "learning_rate": 0.0005, + "loss": 1.0564, + "step": 9507 + }, + { + "epoch": 7.594249201277956, + "grad_norm": 0.08509133756160736, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9508 + }, + { + "epoch": 7.595047923322683, + "grad_norm": 0.3013906478881836, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9509 + }, + { + "epoch": 7.595846645367412, + "grad_norm": 0.19873085618019104, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9510 + }, + { + "epoch": 7.59664536741214, + "grad_norm": 0.16749307513237, + "learning_rate": 0.0005, + "loss": 1.0568, + "step": 9511 + }, + { + "epoch": 7.597444089456869, + "grad_norm": 0.18683338165283203, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9512 + }, + { + "epoch": 7.598242811501597, + "grad_norm": 0.16748754680156708, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9513 + }, + { + "epoch": 7.599041533546326, + "grad_norm": 0.11243029683828354, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9514 + }, + { + "epoch": 7.5998402555910545, + "grad_norm": 0.08024061471223831, + "learning_rate": 0.0005, + "loss": 1.06, + "step": 9515 + }, + { + "epoch": 7.600638977635783, + "grad_norm": 0.20173156261444092, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9516 + }, + { + "epoch": 7.6014376996805115, + "grad_norm": 0.09208648651838303, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9517 + }, + { + "epoch": 7.602236421725239, + "grad_norm": 0.14459539949893951, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9518 + }, + { + "epoch": 7.603035143769968, + "grad_norm": 0.12492551654577255, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9519 + }, + { + "epoch": 7.603833865814696, + "grad_norm": 0.0866885557770729, + "learning_rate": 0.0005, + "loss": 1.0497, + "step": 9520 + }, + { + "epoch": 7.604632587859425, + "grad_norm": 0.18499130010604858, + "learning_rate": 0.0005, + "loss": 1.0476, + "step": 9521 + }, + { + "epoch": 7.605431309904153, + "grad_norm": 0.15807269513607025, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9522 + }, + { + "epoch": 7.606230031948882, + "grad_norm": 0.0766368880867958, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9523 + }, + { + "epoch": 7.60702875399361, + "grad_norm": 0.14770840108394623, + "learning_rate": 0.0005, + "loss": 1.0611, + "step": 9524 + }, + { + "epoch": 7.607827476038339, + "grad_norm": 0.14121781289577484, + "learning_rate": 0.0005, + "loss": 1.0589, + "step": 9525 + }, + { + "epoch": 7.608626198083067, + "grad_norm": 0.061000190675258636, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9526 + }, + { + "epoch": 7.609424920127795, + "grad_norm": 0.10725796967744827, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9527 + }, + { + "epoch": 7.6102236421725244, + "grad_norm": 0.4449822008609772, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9528 + }, + { + "epoch": 7.611022364217252, + "grad_norm": 0.4324694275856018, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9529 + }, + { + "epoch": 7.611821086261981, + "grad_norm": 0.2345885932445526, + "learning_rate": 0.0005, + "loss": 1.0686, + "step": 9530 + }, + { + "epoch": 7.612619808306709, + "grad_norm": 0.38030850887298584, + "learning_rate": 0.0005, + "loss": 1.0635, + "step": 9531 + }, + { + "epoch": 7.613418530351438, + "grad_norm": 0.28466367721557617, + "learning_rate": 0.0005, + "loss": 1.0623, + "step": 9532 + }, + { + "epoch": 7.614217252396166, + "grad_norm": 0.2688463032245636, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9533 + }, + { + "epoch": 7.615015974440895, + "grad_norm": 0.3490566313266754, + "learning_rate": 0.0005, + "loss": 1.0607, + "step": 9534 + }, + { + "epoch": 7.615814696485623, + "grad_norm": 0.17181244492530823, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9535 + }, + { + "epoch": 7.616613418530352, + "grad_norm": 0.2932468056678772, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9536 + }, + { + "epoch": 7.61741214057508, + "grad_norm": 0.07963605225086212, + "learning_rate": 0.0005, + "loss": 1.0629, + "step": 9537 + }, + { + "epoch": 7.618210862619808, + "grad_norm": 0.3166755437850952, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9538 + }, + { + "epoch": 7.6190095846645365, + "grad_norm": 0.13043160736560822, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9539 + }, + { + "epoch": 7.619808306709265, + "grad_norm": 0.22799645364284515, + "learning_rate": 0.0005, + "loss": 1.0593, + "step": 9540 + }, + { + "epoch": 7.6206070287539935, + "grad_norm": 0.13454940915107727, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9541 + }, + { + "epoch": 7.621405750798722, + "grad_norm": 0.25270769000053406, + "learning_rate": 0.0005, + "loss": 1.0594, + "step": 9542 + }, + { + "epoch": 7.622204472843451, + "grad_norm": 0.07556870579719543, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9543 + }, + { + "epoch": 7.623003194888179, + "grad_norm": 0.4405477046966553, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9544 + }, + { + "epoch": 7.623801916932908, + "grad_norm": 0.13088728487491608, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9545 + }, + { + "epoch": 7.624600638977636, + "grad_norm": 0.22698643803596497, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9546 + }, + { + "epoch": 7.625399361022364, + "grad_norm": 0.16014008224010468, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9547 + }, + { + "epoch": 7.626198083067092, + "grad_norm": 0.12253693491220474, + "learning_rate": 0.0005, + "loss": 1.0596, + "step": 9548 + }, + { + "epoch": 7.626996805111821, + "grad_norm": 0.19500789046287537, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9549 + }, + { + "epoch": 7.627795527156549, + "grad_norm": 0.049878381192684174, + "learning_rate": 0.0005, + "loss": 1.0563, + "step": 9550 + }, + { + "epoch": 7.628594249201278, + "grad_norm": 0.15125001966953278, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9551 + }, + { + "epoch": 7.6293929712460065, + "grad_norm": 0.6651006937026978, + "learning_rate": 0.0005, + "loss": 1.0526, + "step": 9552 + }, + { + "epoch": 7.630191693290735, + "grad_norm": 0.12418173998594284, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9553 + }, + { + "epoch": 7.6309904153354635, + "grad_norm": 0.0924144983291626, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9554 + }, + { + "epoch": 7.631789137380192, + "grad_norm": 0.16454379260540009, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9555 + }, + { + "epoch": 7.63258785942492, + "grad_norm": 0.23841862380504608, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9556 + }, + { + "epoch": 7.633386581469648, + "grad_norm": 0.11875062435865402, + "learning_rate": 0.0005, + "loss": 1.0498, + "step": 9557 + }, + { + "epoch": 7.634185303514377, + "grad_norm": 0.16778984665870667, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9558 + }, + { + "epoch": 7.634984025559105, + "grad_norm": 0.12286275625228882, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9559 + }, + { + "epoch": 7.635782747603834, + "grad_norm": 0.11795859038829803, + "learning_rate": 0.0005, + "loss": 1.0524, + "step": 9560 + }, + { + "epoch": 7.636581469648562, + "grad_norm": 0.10615531355142593, + "learning_rate": 0.0005, + "loss": 1.0621, + "step": 9561 + }, + { + "epoch": 7.637380191693291, + "grad_norm": 0.1273939311504364, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9562 + }, + { + "epoch": 7.638178913738019, + "grad_norm": 0.0739448294043541, + "learning_rate": 0.0005, + "loss": 1.0565, + "step": 9563 + }, + { + "epoch": 7.638977635782748, + "grad_norm": 0.3214738965034485, + "learning_rate": 0.0005, + "loss": 1.0639, + "step": 9564 + }, + { + "epoch": 7.6397763578274756, + "grad_norm": 0.13925962150096893, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9565 + }, + { + "epoch": 7.640575079872205, + "grad_norm": 0.07356422394514084, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9566 + }, + { + "epoch": 7.641373801916933, + "grad_norm": 0.0708729475736618, + "learning_rate": 0.0005, + "loss": 1.0545, + "step": 9567 + }, + { + "epoch": 7.642172523961661, + "grad_norm": 0.08209198713302612, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9568 + }, + { + "epoch": 7.64297124600639, + "grad_norm": 0.08787291496992111, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9569 + }, + { + "epoch": 7.643769968051118, + "grad_norm": 0.16093257069587708, + "learning_rate": 0.0005, + "loss": 1.0482, + "step": 9570 + }, + { + "epoch": 7.644568690095847, + "grad_norm": 0.17313137650489807, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9571 + }, + { + "epoch": 7.645367412140575, + "grad_norm": 0.09015117585659027, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9572 + }, + { + "epoch": 7.646166134185304, + "grad_norm": 0.06029650941491127, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9573 + }, + { + "epoch": 7.646964856230032, + "grad_norm": 0.15379463136196136, + "learning_rate": 0.0005, + "loss": 1.0503, + "step": 9574 + }, + { + "epoch": 7.647763578274761, + "grad_norm": 0.11657056212425232, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9575 + }, + { + "epoch": 7.6485623003194885, + "grad_norm": 0.04901152476668358, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9576 + }, + { + "epoch": 7.649361022364217, + "grad_norm": 0.1282874494791031, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9577 + }, + { + "epoch": 7.6501597444089455, + "grad_norm": 0.10117272287607193, + "learning_rate": 0.0005, + "loss": 1.0513, + "step": 9578 + }, + { + "epoch": 7.650958466453674, + "grad_norm": 0.20984217524528503, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9579 + }, + { + "epoch": 7.651757188498403, + "grad_norm": 0.11340182274580002, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9580 + }, + { + "epoch": 7.652555910543131, + "grad_norm": 0.04095076769590378, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9581 + }, + { + "epoch": 7.65335463258786, + "grad_norm": 0.1021147221326828, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9582 + }, + { + "epoch": 7.654153354632588, + "grad_norm": 0.13590390980243683, + "learning_rate": 0.0005, + "loss": 1.0652, + "step": 9583 + }, + { + "epoch": 7.654952076677317, + "grad_norm": 0.11193613708019257, + "learning_rate": 0.0005, + "loss": 1.047, + "step": 9584 + }, + { + "epoch": 7.655750798722044, + "grad_norm": 0.12928733229637146, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9585 + }, + { + "epoch": 7.656549520766773, + "grad_norm": 0.2061600685119629, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9586 + }, + { + "epoch": 7.657348242811501, + "grad_norm": 0.24827928841114044, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9587 + }, + { + "epoch": 7.65814696485623, + "grad_norm": 0.11018779128789902, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9588 + }, + { + "epoch": 7.6589456869009584, + "grad_norm": 0.13417914509773254, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9589 + }, + { + "epoch": 7.659744408945687, + "grad_norm": 0.11718209832906723, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9590 + }, + { + "epoch": 7.6605431309904155, + "grad_norm": 0.11689312011003494, + "learning_rate": 0.0005, + "loss": 1.062, + "step": 9591 + }, + { + "epoch": 7.661341853035144, + "grad_norm": 0.14541427791118622, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9592 + }, + { + "epoch": 7.662140575079873, + "grad_norm": 0.325338751077652, + "learning_rate": 0.0005, + "loss": 1.0502, + "step": 9593 + }, + { + "epoch": 7.6629392971246, + "grad_norm": 0.13927216827869415, + "learning_rate": 0.0005, + "loss": 1.0601, + "step": 9594 + }, + { + "epoch": 7.663738019169329, + "grad_norm": 0.06451129913330078, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9595 + }, + { + "epoch": 7.664536741214057, + "grad_norm": 0.12422754615545273, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9596 + }, + { + "epoch": 7.665335463258786, + "grad_norm": 0.10147815197706223, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9597 + }, + { + "epoch": 7.666134185303514, + "grad_norm": 0.168707475066185, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9598 + }, + { + "epoch": 7.666932907348243, + "grad_norm": 0.13256248831748962, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9599 + }, + { + "epoch": 7.667731629392971, + "grad_norm": 0.10466179251670837, + "learning_rate": 0.0005, + "loss": 1.0519, + "step": 9600 + }, + { + "epoch": 7.6685303514377, + "grad_norm": 0.1508362740278244, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9601 + }, + { + "epoch": 7.669329073482428, + "grad_norm": 0.10080639272928238, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9602 + }, + { + "epoch": 7.670127795527156, + "grad_norm": 0.2546437382698059, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9603 + }, + { + "epoch": 7.6709265175718855, + "grad_norm": 0.119930200278759, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9604 + }, + { + "epoch": 7.671725239616613, + "grad_norm": 0.12494632601737976, + "learning_rate": 0.0005, + "loss": 1.0475, + "step": 9605 + }, + { + "epoch": 7.672523961661342, + "grad_norm": 0.2126263678073883, + "learning_rate": 0.0005, + "loss": 1.0431, + "step": 9606 + }, + { + "epoch": 7.67332268370607, + "grad_norm": 0.058003541082143784, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 9607 + }, + { + "epoch": 7.674121405750799, + "grad_norm": 0.16652998328208923, + "learning_rate": 0.0005, + "loss": 1.0504, + "step": 9608 + }, + { + "epoch": 7.674920127795527, + "grad_norm": 0.18094705045223236, + "learning_rate": 0.0005, + "loss": 1.0533, + "step": 9609 + }, + { + "epoch": 7.675718849840256, + "grad_norm": 0.21123206615447998, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9610 + }, + { + "epoch": 7.676517571884984, + "grad_norm": 0.14626245200634003, + "learning_rate": 0.0005, + "loss": 1.0465, + "step": 9611 + }, + { + "epoch": 7.677316293929713, + "grad_norm": 0.16082580387592316, + "learning_rate": 0.0005, + "loss": 1.0523, + "step": 9612 + }, + { + "epoch": 7.678115015974441, + "grad_norm": 0.2231828272342682, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9613 + }, + { + "epoch": 7.678913738019169, + "grad_norm": 0.0519767664372921, + "learning_rate": 0.0005, + "loss": 1.0514, + "step": 9614 + }, + { + "epoch": 7.6797124600638975, + "grad_norm": 0.11062952876091003, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9615 + }, + { + "epoch": 7.680511182108626, + "grad_norm": 0.1141565814614296, + "learning_rate": 0.0005, + "loss": 1.0495, + "step": 9616 + }, + { + "epoch": 7.681309904153355, + "grad_norm": 0.05675462633371353, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9617 + }, + { + "epoch": 7.682108626198083, + "grad_norm": 0.06369207054376602, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9618 + }, + { + "epoch": 7.682907348242812, + "grad_norm": 0.6857787370681763, + "learning_rate": 0.0005, + "loss": 1.0604, + "step": 9619 + }, + { + "epoch": 7.68370607028754, + "grad_norm": 0.14775703847408295, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9620 + }, + { + "epoch": 7.684504792332269, + "grad_norm": 0.11832108348608017, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9621 + }, + { + "epoch": 7.685303514376997, + "grad_norm": 0.0789126604795456, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9622 + }, + { + "epoch": 7.686102236421725, + "grad_norm": 0.09771233052015305, + "learning_rate": 0.0005, + "loss": 1.0608, + "step": 9623 + }, + { + "epoch": 7.686900958466453, + "grad_norm": 0.1002877801656723, + "learning_rate": 0.0005, + "loss": 1.0528, + "step": 9624 + }, + { + "epoch": 7.687699680511182, + "grad_norm": 0.09265508502721786, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9625 + }, + { + "epoch": 7.68849840255591, + "grad_norm": 0.18757182359695435, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9626 + }, + { + "epoch": 7.689297124600639, + "grad_norm": 0.07585754990577698, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9627 + }, + { + "epoch": 7.6900958466453675, + "grad_norm": 0.08716554194688797, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9628 + }, + { + "epoch": 7.690894568690096, + "grad_norm": 0.12742596864700317, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9629 + }, + { + "epoch": 7.6916932907348246, + "grad_norm": 0.3201116621494293, + "learning_rate": 0.0005, + "loss": 1.0615, + "step": 9630 + }, + { + "epoch": 7.692492012779553, + "grad_norm": 0.13922421634197235, + "learning_rate": 0.0005, + "loss": 1.0542, + "step": 9631 + }, + { + "epoch": 7.693290734824281, + "grad_norm": 0.1482384353876114, + "learning_rate": 0.0005, + "loss": 1.0552, + "step": 9632 + }, + { + "epoch": 7.694089456869009, + "grad_norm": 0.44062909483909607, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9633 + }, + { + "epoch": 7.694888178913738, + "grad_norm": 0.09945067763328552, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9634 + }, + { + "epoch": 7.695686900958466, + "grad_norm": 0.09670528769493103, + "learning_rate": 0.0005, + "loss": 1.0553, + "step": 9635 + }, + { + "epoch": 7.696485623003195, + "grad_norm": 0.21770933270454407, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9636 + }, + { + "epoch": 7.697284345047923, + "grad_norm": 0.08205332607030869, + "learning_rate": 0.0005, + "loss": 1.0586, + "step": 9637 + }, + { + "epoch": 7.698083067092652, + "grad_norm": 0.20427794754505157, + "learning_rate": 0.0005, + "loss": 1.061, + "step": 9638 + }, + { + "epoch": 7.69888178913738, + "grad_norm": 0.10897956788539886, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9639 + }, + { + "epoch": 7.699680511182109, + "grad_norm": 0.09363125264644623, + "learning_rate": 0.0005, + "loss": 1.0536, + "step": 9640 + }, + { + "epoch": 7.700479233226837, + "grad_norm": 0.11652582138776779, + "learning_rate": 0.0005, + "loss": 1.0573, + "step": 9641 + }, + { + "epoch": 7.701277955271565, + "grad_norm": 0.11023811250925064, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9642 + }, + { + "epoch": 7.702076677316294, + "grad_norm": 0.0836176723241806, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9643 + }, + { + "epoch": 7.702875399361022, + "grad_norm": 0.16838227212429047, + "learning_rate": 0.0005, + "loss": 1.0529, + "step": 9644 + }, + { + "epoch": 7.703674121405751, + "grad_norm": 0.0736071765422821, + "learning_rate": 0.0005, + "loss": 1.0619, + "step": 9645 + }, + { + "epoch": 7.704472843450479, + "grad_norm": 0.12039043009281158, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9646 + }, + { + "epoch": 7.705271565495208, + "grad_norm": 0.08551492542028427, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9647 + }, + { + "epoch": 7.706070287539936, + "grad_norm": 0.4940882921218872, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9648 + }, + { + "epoch": 7.706869009584665, + "grad_norm": 0.09832077473402023, + "learning_rate": 0.0005, + "loss": 1.0509, + "step": 9649 + }, + { + "epoch": 7.707667731629393, + "grad_norm": 0.059512801468372345, + "learning_rate": 0.0005, + "loss": 1.0549, + "step": 9650 + }, + { + "epoch": 7.708466453674122, + "grad_norm": 0.10426498204469681, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9651 + }, + { + "epoch": 7.7092651757188495, + "grad_norm": 0.07802911102771759, + "learning_rate": 0.0005, + "loss": 1.0546, + "step": 9652 + }, + { + "epoch": 7.710063897763578, + "grad_norm": 0.07615980505943298, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9653 + }, + { + "epoch": 7.710862619808307, + "grad_norm": 0.0960751548409462, + "learning_rate": 0.0005, + "loss": 1.0515, + "step": 9654 + }, + { + "epoch": 7.711661341853035, + "grad_norm": 0.2435871958732605, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9655 + }, + { + "epoch": 7.712460063897764, + "grad_norm": 0.05712791904807091, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9656 + }, + { + "epoch": 7.713258785942492, + "grad_norm": 0.08460236340761185, + "learning_rate": 0.0005, + "loss": 1.059, + "step": 9657 + }, + { + "epoch": 7.714057507987221, + "grad_norm": 0.07319195568561554, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9658 + }, + { + "epoch": 7.714856230031949, + "grad_norm": 0.09150193631649017, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9659 + }, + { + "epoch": 7.715654952076678, + "grad_norm": 0.1096913143992424, + "learning_rate": 0.0005, + "loss": 1.0576, + "step": 9660 + }, + { + "epoch": 7.716453674121405, + "grad_norm": 0.0675668716430664, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9661 + }, + { + "epoch": 7.717252396166134, + "grad_norm": 0.0719466358423233, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9662 + }, + { + "epoch": 7.718051118210862, + "grad_norm": 0.0392761304974556, + "learning_rate": 0.0005, + "loss": 1.0507, + "step": 9663 + }, + { + "epoch": 7.718849840255591, + "grad_norm": 0.0673295333981514, + "learning_rate": 0.0005, + "loss": 1.0567, + "step": 9664 + }, + { + "epoch": 7.7196485623003195, + "grad_norm": 0.10867837816476822, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9665 + }, + { + "epoch": 7.720447284345048, + "grad_norm": 0.6895002126693726, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9666 + }, + { + "epoch": 7.7212460063897765, + "grad_norm": 0.09527186304330826, + "learning_rate": 0.0005, + "loss": 1.0512, + "step": 9667 + }, + { + "epoch": 7.722044728434505, + "grad_norm": 0.11535433679819107, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9668 + }, + { + "epoch": 7.722843450479234, + "grad_norm": 0.08127015084028244, + "learning_rate": 0.0005, + "loss": 1.0521, + "step": 9669 + }, + { + "epoch": 7.723642172523961, + "grad_norm": 0.06600163877010345, + "learning_rate": 0.0005, + "loss": 1.0559, + "step": 9670 + }, + { + "epoch": 7.72444089456869, + "grad_norm": 0.1283862143754959, + "learning_rate": 0.0005, + "loss": 1.0522, + "step": 9671 + }, + { + "epoch": 7.725239616613418, + "grad_norm": 0.04981343448162079, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9672 + }, + { + "epoch": 7.726038338658147, + "grad_norm": 0.08641577512025833, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9673 + }, + { + "epoch": 7.726837060702875, + "grad_norm": 0.0503465011715889, + "learning_rate": 0.0005, + "loss": 1.0535, + "step": 9674 + }, + { + "epoch": 7.727635782747604, + "grad_norm": 0.08342859894037247, + "learning_rate": 0.0005, + "loss": 1.0616, + "step": 9675 + }, + { + "epoch": 7.728434504792332, + "grad_norm": 0.11919692903757095, + "learning_rate": 0.0005, + "loss": 1.0631, + "step": 9676 + }, + { + "epoch": 7.729233226837061, + "grad_norm": 0.2119137942790985, + "learning_rate": 0.0005, + "loss": 1.0537, + "step": 9677 + }, + { + "epoch": 7.7300319488817895, + "grad_norm": 0.04871589317917824, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9678 + }, + { + "epoch": 7.730830670926517, + "grad_norm": 0.09571115672588348, + "learning_rate": 0.0005, + "loss": 1.0508, + "step": 9679 + }, + { + "epoch": 7.731629392971246, + "grad_norm": 0.07192373275756836, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9680 + }, + { + "epoch": 7.732428115015974, + "grad_norm": 1.5483330488204956, + "learning_rate": 0.0005, + "loss": 1.049, + "step": 9681 + }, + { + "epoch": 7.733226837060703, + "grad_norm": 0.11159799993038177, + "learning_rate": 0.0005, + "loss": 1.058, + "step": 9682 + }, + { + "epoch": 7.734025559105431, + "grad_norm": 0.1700333058834076, + "learning_rate": 0.0005, + "loss": 1.0572, + "step": 9683 + }, + { + "epoch": 7.73482428115016, + "grad_norm": 0.06227154657244682, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9684 + }, + { + "epoch": 7.735623003194888, + "grad_norm": 0.46623915433883667, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9685 + }, + { + "epoch": 7.736421725239617, + "grad_norm": 0.235361710190773, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9686 + }, + { + "epoch": 7.737220447284345, + "grad_norm": 0.11328862607479095, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9687 + }, + { + "epoch": 7.738019169329074, + "grad_norm": 0.05050930753350258, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9688 + }, + { + "epoch": 7.738817891373802, + "grad_norm": 0.12438485771417618, + "learning_rate": 0.0005, + "loss": 1.052, + "step": 9689 + }, + { + "epoch": 7.73961661341853, + "grad_norm": 0.08985885977745056, + "learning_rate": 0.0005, + "loss": 1.055, + "step": 9690 + }, + { + "epoch": 7.7404153354632586, + "grad_norm": 0.09222928434610367, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9691 + }, + { + "epoch": 7.741214057507987, + "grad_norm": 0.0879027396440506, + "learning_rate": 0.0005, + "loss": 1.048, + "step": 9692 + }, + { + "epoch": 7.742012779552716, + "grad_norm": 0.18554086983203888, + "learning_rate": 0.0005, + "loss": 1.0588, + "step": 9693 + }, + { + "epoch": 7.742811501597444, + "grad_norm": 0.197623610496521, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9694 + }, + { + "epoch": 7.743610223642173, + "grad_norm": 0.07009958475828171, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9695 + }, + { + "epoch": 7.744408945686901, + "grad_norm": 0.059514936059713364, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9696 + }, + { + "epoch": 7.74520766773163, + "grad_norm": 0.6503719687461853, + "learning_rate": 0.0005, + "loss": 1.0532, + "step": 9697 + }, + { + "epoch": 7.746006389776358, + "grad_norm": 0.4739440977573395, + "learning_rate": 0.0005, + "loss": 1.0562, + "step": 9698 + }, + { + "epoch": 7.746805111821086, + "grad_norm": 0.15581674873828888, + "learning_rate": 0.0005, + "loss": 1.0605, + "step": 9699 + }, + { + "epoch": 7.747603833865814, + "grad_norm": 0.3622123897075653, + "learning_rate": 0.0005, + "loss": 1.0624, + "step": 9700 + }, + { + "epoch": 7.748402555910543, + "grad_norm": 0.16665314137935638, + "learning_rate": 0.0005, + "loss": 1.0637, + "step": 9701 + }, + { + "epoch": 7.7492012779552715, + "grad_norm": 0.2903657853603363, + "learning_rate": 0.0005, + "loss": 1.0627, + "step": 9702 + }, + { + "epoch": 7.75, + "grad_norm": 0.1619565337896347, + "learning_rate": 0.0005, + "loss": 1.0582, + "step": 9703 + }, + { + "epoch": 7.7507987220447285, + "grad_norm": 0.816677987575531, + "learning_rate": 0.0005, + "loss": 1.071, + "step": 9704 + }, + { + "epoch": 7.751597444089457, + "grad_norm": 0.20582620799541473, + "learning_rate": 0.0005, + "loss": 1.0506, + "step": 9705 + }, + { + "epoch": 7.752396166134186, + "grad_norm": 0.24247393012046814, + "learning_rate": 0.0005, + "loss": 1.0658, + "step": 9706 + }, + { + "epoch": 7.753194888178914, + "grad_norm": 0.2543368339538574, + "learning_rate": 0.0005, + "loss": 1.0591, + "step": 9707 + }, + { + "epoch": 7.753993610223642, + "grad_norm": 0.13885188102722168, + "learning_rate": 0.0005, + "loss": 1.0505, + "step": 9708 + }, + { + "epoch": 7.75479233226837, + "grad_norm": 0.22553203999996185, + "learning_rate": 0.0005, + "loss": 1.0501, + "step": 9709 + }, + { + "epoch": 7.755591054313099, + "grad_norm": 0.4961666762828827, + "learning_rate": 0.0005, + "loss": 1.0543, + "step": 9710 + }, + { + "epoch": 7.756389776357827, + "grad_norm": 0.15139277279376984, + "learning_rate": 0.0005, + "loss": 1.0554, + "step": 9711 + }, + { + "epoch": 7.757188498402556, + "grad_norm": 0.1196078360080719, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9712 + }, + { + "epoch": 7.757987220447284, + "grad_norm": 0.21309585869312286, + "learning_rate": 0.0005, + "loss": 1.0547, + "step": 9713 + }, + { + "epoch": 7.758785942492013, + "grad_norm": 0.07756591588258743, + "learning_rate": 0.0005, + "loss": 1.0694, + "step": 9714 + }, + { + "epoch": 7.7595846645367414, + "grad_norm": 0.1986755132675171, + "learning_rate": 0.0005, + "loss": 1.0641, + "step": 9715 + }, + { + "epoch": 7.76038338658147, + "grad_norm": 0.08994139730930328, + "learning_rate": 0.0005, + "loss": 1.0606, + "step": 9716 + }, + { + "epoch": 7.761182108626198, + "grad_norm": 0.19416365027427673, + "learning_rate": 0.0005, + "loss": 1.0583, + "step": 9717 + }, + { + "epoch": 7.761980830670926, + "grad_norm": 0.08997556567192078, + "learning_rate": 0.0005, + "loss": 1.0517, + "step": 9718 + }, + { + "epoch": 7.762779552715655, + "grad_norm": 0.16295979917049408, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9719 + }, + { + "epoch": 7.763578274760383, + "grad_norm": 0.15271921455860138, + "learning_rate": 0.0005, + "loss": 1.0581, + "step": 9720 + }, + { + "epoch": 7.764376996805112, + "grad_norm": 0.15118274092674255, + "learning_rate": 0.0005, + "loss": 1.0646, + "step": 9721 + }, + { + "epoch": 7.76517571884984, + "grad_norm": 0.14820800721645355, + "learning_rate": 0.0005, + "loss": 1.0527, + "step": 9722 + }, + { + "epoch": 7.765974440894569, + "grad_norm": 0.08788670599460602, + "learning_rate": 0.0005, + "loss": 1.0556, + "step": 9723 + }, + { + "epoch": 7.766773162939297, + "grad_norm": 0.13634555041790009, + "learning_rate": 0.0005, + "loss": 1.0626, + "step": 9724 + }, + { + "epoch": 7.767571884984026, + "grad_norm": 0.3266567885875702, + "learning_rate": 0.0005, + "loss": 1.0657, + "step": 9725 + }, + { + "epoch": 7.768370607028754, + "grad_norm": 0.14486448466777802, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9726 + }, + { + "epoch": 7.769169329073483, + "grad_norm": 0.1453651785850525, + "learning_rate": 0.0005, + "loss": 1.0613, + "step": 9727 + }, + { + "epoch": 7.7699680511182105, + "grad_norm": 0.09860636293888092, + "learning_rate": 0.0005, + "loss": 1.0541, + "step": 9728 + }, + { + "epoch": 7.770766773162939, + "grad_norm": 0.1391478180885315, + "learning_rate": 0.0005, + "loss": 1.065, + "step": 9729 + }, + { + "epoch": 7.771565495207668, + "grad_norm": 0.09883351624011993, + "learning_rate": 0.0005, + "loss": 1.0544, + "step": 9730 + }, + { + "epoch": 7.772364217252396, + "grad_norm": 0.08394116163253784, + "learning_rate": 0.0005, + "loss": 1.0557, + "step": 9731 + }, + { + "epoch": 7.773162939297125, + "grad_norm": 0.09769196063280106, + "learning_rate": 0.0005, + "loss": 1.0561, + "step": 9732 + }, + { + "epoch": 7.773961661341853, + "grad_norm": 0.13514496386051178, + "learning_rate": 0.0005, + "loss": 1.0628, + "step": 9733 + }, + { + "epoch": 7.774760383386582, + "grad_norm": 0.042965635657310486, + "learning_rate": 0.0005, + "loss": 1.0566, + "step": 9734 + }, + { + "epoch": 7.77555910543131, + "grad_norm": 0.1645607203245163, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9735 + }, + { + "epoch": 7.776357827476039, + "grad_norm": 0.07206106185913086, + "learning_rate": 0.0005, + "loss": 1.0578, + "step": 9736 + }, + { + "epoch": 7.777156549520766, + "grad_norm": 0.12476811558008194, + "learning_rate": 0.0005, + "loss": 1.0633, + "step": 9737 + }, + { + "epoch": 7.777955271565495, + "grad_norm": 0.13698029518127441, + "learning_rate": 0.0005, + "loss": 1.0579, + "step": 9738 + }, + { + "epoch": 7.7787539936102235, + "grad_norm": 0.06305114924907684, + "learning_rate": 0.0005, + "loss": 1.0558, + "step": 9739 + }, + { + "epoch": 7.779552715654952, + "grad_norm": 0.08472646027803421, + "learning_rate": 0.0005, + "loss": 1.0539, + "step": 9740 + }, + { + "epoch": 7.7803514376996805, + "grad_norm": 0.11592312157154083, + "learning_rate": 0.0005, + "loss": 1.0534, + "step": 9741 + }, + { + "epoch": 7.781150159744409, + "grad_norm": 0.1425880789756775, + "learning_rate": 0.0005, + "loss": 1.0518, + "step": 9742 + }, + { + "epoch": 7.781948881789138, + "grad_norm": 0.15640930831432343, + "learning_rate": 0.0005, + "loss": 1.057, + "step": 9743 + }, + { + "epoch": 7.782747603833866, + "grad_norm": 0.10394492000341415, + "learning_rate": 0.0005, + "loss": 1.0598, + "step": 9744 + }, + { + "epoch": 7.783546325878595, + "grad_norm": 0.11625290662050247, + "learning_rate": 0.0005, + "loss": 1.0609, + "step": 9745 + }, + { + "epoch": 7.784345047923322, + "grad_norm": 0.10535796731710434, + "learning_rate": 0.0005, + "loss": 1.0548, + "step": 9746 + }, + { + "epoch": 7.785143769968051, + "grad_norm": 3.235619068145752, + "learning_rate": 0.0005, + "loss": 1.0577, + "step": 9747 + }, + { + "epoch": 7.785942492012779, + "grad_norm": 0.15474911034107208, + "learning_rate": 0.0005, + "loss": 1.0584, + "step": 9748 + }, + { + "epoch": 7.786741214057508, + "grad_norm": 0.15647299587726593, + "learning_rate": 0.0005, + "loss": 1.0555, + "step": 9749 + }, + { + "epoch": 7.787539936102236, + "grad_norm": 0.09747028350830078, + "learning_rate": 0.0005, + "loss": 1.0569, + "step": 9750 + } + ], + "logging_steps": 1.0, + "max_steps": 751200, + "num_input_tokens_seen": 0, + "num_train_epochs": 600, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.4605371755384013e+18, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9750/training_args.bin b/checkpoint-9750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e0162074424e3714af8119d3be2b6e69cbb5b9f2 --- /dev/null +++ b/checkpoint-9750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06816c37733f99d23f044cefd981b2f404a72ddf40fa59f794154596b842fa95 +size 6072