Romain-XV commited on
Commit
fecd3ff
·
verified ·
1 Parent(s): c53eec9

Training in progress, step 626, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b1dc78f2811fda88b6512f88c78676cb2fe7d46b216090d585f2fddf17cf2e5
3
  size 201892112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f7fe686ad7c64bd02d4fd094260bb621073341f3d4940d8aa14be3b59e6d64
3
  size 201892112
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4993232b70f7933ea0542f10f85ddae5170cc7f565380cdf9563cd04075734fc
3
  size 102864868
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31f576c838d24ff45665b08ea5486b4a27d0c8d600c5d252c3502ef97ab980e0
3
  size 102864868
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5c3744dee5ce5fe12a3912f55428a52f75277710f850c4f7eecbb8b78b9ac1a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d48d59ca308d922824d340dfcf616c74acd85da32e75e9dc52670721e9935a1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a69697be86f6228636dd6aabb85e071a7891a09986a73f06d5f0057c339b6544
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46d020344d4cdb625fa3ccdabe828dfdae277b492bb5282dbb9f16edd7c73161
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.384363055229187,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-500",
4
- "epoch": 1.5987210231814548,
5
  "eval_steps": 100,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3555,6 +3555,896 @@
3555
  "eval_samples_per_second": 23.99,
3556
  "eval_steps_per_second": 6.009,
3557
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3558
  }
3559
  ],
3560
  "logging_steps": 1,
@@ -3578,12 +4468,12 @@
3578
  "should_evaluate": false,
3579
  "should_log": false,
3580
  "should_save": true,
3581
- "should_training_stop": false
3582
  },
3583
  "attributes": {}
3584
  }
3585
  },
3586
- "total_flos": 2.405657490625659e+17,
3587
  "train_batch_size": 4,
3588
  "trial_name": null,
3589
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.379648208618164,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
+ "epoch": 2.0015987210231816,
5
  "eval_steps": 100,
6
+ "global_step": 626,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3555
  "eval_samples_per_second": 23.99,
3556
  "eval_steps_per_second": 6.009,
3557
  "step": 500
3558
+ },
3559
+ {
3560
+ "epoch": 1.6019184652278178,
3561
+ "grad_norm": 0.5501524209976196,
3562
+ "learning_rate": 1.9641287434001355e-05,
3563
+ "loss": 1.3995,
3564
+ "step": 501
3565
+ },
3566
+ {
3567
+ "epoch": 1.6051159072741807,
3568
+ "grad_norm": 0.457994282245636,
3569
+ "learning_rate": 1.9338789076247e-05,
3570
+ "loss": 1.2441,
3571
+ "step": 502
3572
+ },
3573
+ {
3574
+ "epoch": 1.6083133493205435,
3575
+ "grad_norm": 0.4338493347167969,
3576
+ "learning_rate": 1.9038388702229403e-05,
3577
+ "loss": 1.1424,
3578
+ "step": 503
3579
+ },
3580
+ {
3581
+ "epoch": 1.6115107913669064,
3582
+ "grad_norm": 0.5547595620155334,
3583
+ "learning_rate": 1.874009412530877e-05,
3584
+ "loss": 1.524,
3585
+ "step": 504
3586
+ },
3587
+ {
3588
+ "epoch": 1.6147082334132694,
3589
+ "grad_norm": 0.4631533920764923,
3590
+ "learning_rate": 1.8443913104073983e-05,
3591
+ "loss": 1.1378,
3592
+ "step": 505
3593
+ },
3594
+ {
3595
+ "epoch": 1.6179056754596322,
3596
+ "grad_norm": 0.4832412898540497,
3597
+ "learning_rate": 1.8149853342140645e-05,
3598
+ "loss": 1.3494,
3599
+ "step": 506
3600
+ },
3601
+ {
3602
+ "epoch": 1.6211031175059953,
3603
+ "grad_norm": 0.48691773414611816,
3604
+ "learning_rate": 1.7857922487950874e-05,
3605
+ "loss": 1.2168,
3606
+ "step": 507
3607
+ },
3608
+ {
3609
+ "epoch": 1.624300559552358,
3610
+ "grad_norm": 0.5417378544807434,
3611
+ "learning_rate": 1.7568128134574113e-05,
3612
+ "loss": 1.3404,
3613
+ "step": 508
3614
+ },
3615
+ {
3616
+ "epoch": 1.627498001598721,
3617
+ "grad_norm": 0.4586952328681946,
3618
+ "learning_rate": 1.728047781950999e-05,
3619
+ "loss": 1.2989,
3620
+ "step": 509
3621
+ },
3622
+ {
3623
+ "epoch": 1.630695443645084,
3624
+ "grad_norm": 0.4418376386165619,
3625
+ "learning_rate": 1.6994979024491942e-05,
3626
+ "loss": 1.1664,
3627
+ "step": 510
3628
+ },
3629
+ {
3630
+ "epoch": 1.6338928856914468,
3631
+ "grad_norm": 0.4903421700000763,
3632
+ "learning_rate": 1.671163917529285e-05,
3633
+ "loss": 1.2322,
3634
+ "step": 511
3635
+ },
3636
+ {
3637
+ "epoch": 1.6370903277378097,
3638
+ "grad_norm": 0.44814610481262207,
3639
+ "learning_rate": 1.64304656415317e-05,
3640
+ "loss": 1.1332,
3641
+ "step": 512
3642
+ },
3643
+ {
3644
+ "epoch": 1.6402877697841727,
3645
+ "grad_norm": 0.5581910610198975,
3646
+ "learning_rate": 1.6151465736482107e-05,
3647
+ "loss": 1.2723,
3648
+ "step": 513
3649
+ },
3650
+ {
3651
+ "epoch": 1.6434852118305354,
3652
+ "grad_norm": 0.5089916586875916,
3653
+ "learning_rate": 1.587464671688187e-05,
3654
+ "loss": 1.359,
3655
+ "step": 514
3656
+ },
3657
+ {
3658
+ "epoch": 1.6466826538768986,
3659
+ "grad_norm": 0.4594471752643585,
3660
+ "learning_rate": 1.5600015782744492e-05,
3661
+ "loss": 1.0479,
3662
+ "step": 515
3663
+ },
3664
+ {
3665
+ "epoch": 1.6498800959232613,
3666
+ "grad_norm": 0.48605117201805115,
3667
+ "learning_rate": 1.5327580077171587e-05,
3668
+ "loss": 1.1852,
3669
+ "step": 516
3670
+ },
3671
+ {
3672
+ "epoch": 1.6530775379696243,
3673
+ "grad_norm": 0.4928354322910309,
3674
+ "learning_rate": 1.5057346686167428e-05,
3675
+ "loss": 1.4087,
3676
+ "step": 517
3677
+ },
3678
+ {
3679
+ "epoch": 1.6562749800159873,
3680
+ "grad_norm": 0.5630521774291992,
3681
+ "learning_rate": 1.4789322638454351e-05,
3682
+ "loss": 1.5852,
3683
+ "step": 518
3684
+ },
3685
+ {
3686
+ "epoch": 1.65947242206235,
3687
+ "grad_norm": 0.46705761551856995,
3688
+ "learning_rate": 1.452351490529017e-05,
3689
+ "loss": 1.0012,
3690
+ "step": 519
3691
+ },
3692
+ {
3693
+ "epoch": 1.662669864108713,
3694
+ "grad_norm": 0.4570344388484955,
3695
+ "learning_rate": 1.4259930400286669e-05,
3696
+ "loss": 1.266,
3697
+ "step": 520
3698
+ },
3699
+ {
3700
+ "epoch": 1.665867306155076,
3701
+ "grad_norm": 0.4869745671749115,
3702
+ "learning_rate": 1.3998575979229944e-05,
3703
+ "loss": 1.4107,
3704
+ "step": 521
3705
+ },
3706
+ {
3707
+ "epoch": 1.6690647482014387,
3708
+ "grad_norm": 0.46870559453964233,
3709
+ "learning_rate": 1.373945843990192e-05,
3710
+ "loss": 1.0377,
3711
+ "step": 522
3712
+ },
3713
+ {
3714
+ "epoch": 1.6722621902478019,
3715
+ "grad_norm": 0.46983346343040466,
3716
+ "learning_rate": 1.3482584521903718e-05,
3717
+ "loss": 1.3612,
3718
+ "step": 523
3719
+ },
3720
+ {
3721
+ "epoch": 1.6754596322941646,
3722
+ "grad_norm": 0.5902780294418335,
3723
+ "learning_rate": 1.322796090648013e-05,
3724
+ "loss": 1.1495,
3725
+ "step": 524
3726
+ },
3727
+ {
3728
+ "epoch": 1.6786570743405276,
3729
+ "grad_norm": 0.5421798229217529,
3730
+ "learning_rate": 1.2975594216346144e-05,
3731
+ "loss": 1.4595,
3732
+ "step": 525
3733
+ },
3734
+ {
3735
+ "epoch": 1.6818545163868905,
3736
+ "grad_norm": 0.47739914059638977,
3737
+ "learning_rate": 1.272549101551438e-05,
3738
+ "loss": 1.296,
3739
+ "step": 526
3740
+ },
3741
+ {
3742
+ "epoch": 1.6850519584332533,
3743
+ "grad_norm": 0.46318313479423523,
3744
+ "learning_rate": 1.2477657809124631e-05,
3745
+ "loss": 1.2161,
3746
+ "step": 527
3747
+ },
3748
+ {
3749
+ "epoch": 1.6882494004796165,
3750
+ "grad_norm": 0.5958060026168823,
3751
+ "learning_rate": 1.2232101043274436e-05,
3752
+ "loss": 1.2663,
3753
+ "step": 528
3754
+ },
3755
+ {
3756
+ "epoch": 1.6914468425259792,
3757
+ "grad_norm": 0.44811907410621643,
3758
+ "learning_rate": 1.1988827104851574e-05,
3759
+ "loss": 1.1838,
3760
+ "step": 529
3761
+ },
3762
+ {
3763
+ "epoch": 1.6946442845723422,
3764
+ "grad_norm": 0.45057952404022217,
3765
+ "learning_rate": 1.1747842321367886e-05,
3766
+ "loss": 1.3221,
3767
+ "step": 530
3768
+ },
3769
+ {
3770
+ "epoch": 1.6978417266187051,
3771
+ "grad_norm": 0.4354062080383301,
3772
+ "learning_rate": 1.1509152960794666e-05,
3773
+ "loss": 1.2421,
3774
+ "step": 531
3775
+ },
3776
+ {
3777
+ "epoch": 1.7010391686650679,
3778
+ "grad_norm": 0.47361254692077637,
3779
+ "learning_rate": 1.1272765231399685e-05,
3780
+ "loss": 1.4439,
3781
+ "step": 532
3782
+ },
3783
+ {
3784
+ "epoch": 1.7042366107114308,
3785
+ "grad_norm": 0.4950743019580841,
3786
+ "learning_rate": 1.1038685281585736e-05,
3787
+ "loss": 1.3103,
3788
+ "step": 533
3789
+ },
3790
+ {
3791
+ "epoch": 1.7074340527577938,
3792
+ "grad_norm": 0.5186318159103394,
3793
+ "learning_rate": 1.0806919199730615e-05,
3794
+ "loss": 1.3485,
3795
+ "step": 534
3796
+ },
3797
+ {
3798
+ "epoch": 1.7106314948041565,
3799
+ "grad_norm": 0.38383203744888306,
3800
+ "learning_rate": 1.057747301402887e-05,
3801
+ "loss": 1.0194,
3802
+ "step": 535
3803
+ },
3804
+ {
3805
+ "epoch": 1.7138289368505197,
3806
+ "grad_norm": 0.4821476638317108,
3807
+ "learning_rate": 1.035035269233493e-05,
3808
+ "loss": 1.3887,
3809
+ "step": 536
3810
+ },
3811
+ {
3812
+ "epoch": 1.7170263788968825,
3813
+ "grad_norm": 0.5094884634017944,
3814
+ "learning_rate": 1.0125564142007948e-05,
3815
+ "loss": 1.2771,
3816
+ "step": 537
3817
+ },
3818
+ {
3819
+ "epoch": 1.7202238209432454,
3820
+ "grad_norm": 0.6006470918655396,
3821
+ "learning_rate": 9.903113209758096e-06,
3822
+ "loss": 1.4993,
3823
+ "step": 538
3824
+ },
3825
+ {
3826
+ "epoch": 1.7234212629896084,
3827
+ "grad_norm": 0.4611685276031494,
3828
+ "learning_rate": 9.683005681494506e-06,
3829
+ "loss": 1.1291,
3830
+ "step": 539
3831
+ },
3832
+ {
3833
+ "epoch": 1.7266187050359711,
3834
+ "grad_norm": 0.4802876114845276,
3835
+ "learning_rate": 9.465247282174805e-06,
3836
+ "loss": 1.3996,
3837
+ "step": 540
3838
+ },
3839
+ {
3840
+ "epoch": 1.729816147082334,
3841
+ "grad_norm": 0.4477510154247284,
3842
+ "learning_rate": 9.249843675656212e-06,
3843
+ "loss": 1.2913,
3844
+ "step": 541
3845
+ },
3846
+ {
3847
+ "epoch": 1.733013589128697,
3848
+ "grad_norm": 0.403728187084198,
3849
+ "learning_rate": 9.036800464548157e-06,
3850
+ "loss": 1.1401,
3851
+ "step": 542
3852
+ },
3853
+ {
3854
+ "epoch": 1.7362110311750598,
3855
+ "grad_norm": 0.553962230682373,
3856
+ "learning_rate": 8.826123190066671e-06,
3857
+ "loss": 1.4372,
3858
+ "step": 543
3859
+ },
3860
+ {
3861
+ "epoch": 1.739408473221423,
3862
+ "grad_norm": 0.5109530091285706,
3863
+ "learning_rate": 8.617817331890154e-06,
3864
+ "loss": 1.2767,
3865
+ "step": 544
3866
+ },
3867
+ {
3868
+ "epoch": 1.7426059152677857,
3869
+ "grad_norm": 0.4755638837814331,
3870
+ "learning_rate": 8.411888308016847e-06,
3871
+ "loss": 1.3341,
3872
+ "step": 545
3873
+ },
3874
+ {
3875
+ "epoch": 1.7458033573141487,
3876
+ "grad_norm": 0.4327421188354492,
3877
+ "learning_rate": 8.208341474624071e-06,
3878
+ "loss": 1.0796,
3879
+ "step": 546
3880
+ },
3881
+ {
3882
+ "epoch": 1.7490007993605117,
3883
+ "grad_norm": 0.465468168258667,
3884
+ "learning_rate": 8.00718212592868e-06,
3885
+ "loss": 1.3402,
3886
+ "step": 547
3887
+ },
3888
+ {
3889
+ "epoch": 1.7521982414068744,
3890
+ "grad_norm": 0.5804145932197571,
3891
+ "learning_rate": 7.808415494049514e-06,
3892
+ "loss": 1.5293,
3893
+ "step": 548
3894
+ },
3895
+ {
3896
+ "epoch": 1.7553956834532374,
3897
+ "grad_norm": 0.44623619318008423,
3898
+ "learning_rate": 7.612046748871327e-06,
3899
+ "loss": 1.0433,
3900
+ "step": 549
3901
+ },
3902
+ {
3903
+ "epoch": 1.7585931254996003,
3904
+ "grad_norm": 0.48140379786491394,
3905
+ "learning_rate": 7.4180809979102036e-06,
3906
+ "loss": 1.2669,
3907
+ "step": 550
3908
+ },
3909
+ {
3910
+ "epoch": 1.761790567545963,
3911
+ "grad_norm": 0.4790396988391876,
3912
+ "learning_rate": 7.226523286180776e-06,
3913
+ "loss": 1.2912,
3914
+ "step": 551
3915
+ },
3916
+ {
3917
+ "epoch": 1.7649880095923263,
3918
+ "grad_norm": 0.5187086462974548,
3919
+ "learning_rate": 7.0373785960650475e-06,
3920
+ "loss": 1.0535,
3921
+ "step": 552
3922
+ },
3923
+ {
3924
+ "epoch": 1.768185451638689,
3925
+ "grad_norm": 0.539508581161499,
3926
+ "learning_rate": 6.850651847182743e-06,
3927
+ "loss": 1.486,
3928
+ "step": 553
3929
+ },
3930
+ {
3931
+ "epoch": 1.771382893685052,
3932
+ "grad_norm": 0.5147770643234253,
3933
+ "learning_rate": 6.666347896263325e-06,
3934
+ "loss": 1.2978,
3935
+ "step": 554
3936
+ },
3937
+ {
3938
+ "epoch": 1.774580335731415,
3939
+ "grad_norm": 0.5139119625091553,
3940
+ "learning_rate": 6.4844715370197874e-06,
3941
+ "loss": 1.2245,
3942
+ "step": 555
3943
+ },
3944
+ {
3945
+ "epoch": 1.7777777777777777,
3946
+ "grad_norm": 0.4947463572025299,
3947
+ "learning_rate": 6.3050275000238414e-06,
3948
+ "loss": 1.1726,
3949
+ "step": 556
3950
+ },
3951
+ {
3952
+ "epoch": 1.7809752198241406,
3953
+ "grad_norm": 0.5504704713821411,
3954
+ "learning_rate": 6.128020452582917e-06,
3955
+ "loss": 1.4667,
3956
+ "step": 557
3957
+ },
3958
+ {
3959
+ "epoch": 1.7841726618705036,
3960
+ "grad_norm": 0.46442416310310364,
3961
+ "learning_rate": 5.953454998618857e-06,
3962
+ "loss": 1.2782,
3963
+ "step": 558
3964
+ },
3965
+ {
3966
+ "epoch": 1.7873701039168663,
3967
+ "grad_norm": 0.6431064009666443,
3968
+ "learning_rate": 5.781335678547995e-06,
3969
+ "loss": 1.4561,
3970
+ "step": 559
3971
+ },
3972
+ {
3973
+ "epoch": 1.7905675459632295,
3974
+ "grad_norm": 0.5108866095542908,
3975
+ "learning_rate": 5.611666969163243e-06,
3976
+ "loss": 1.3854,
3977
+ "step": 560
3978
+ },
3979
+ {
3980
+ "epoch": 1.7937649880095923,
3981
+ "grad_norm": 0.5375909209251404,
3982
+ "learning_rate": 5.4444532835175144e-06,
3983
+ "loss": 1.4556,
3984
+ "step": 561
3985
+ },
3986
+ {
3987
+ "epoch": 1.7969624300559552,
3988
+ "grad_norm": 0.4582259953022003,
3989
+ "learning_rate": 5.27969897080901e-06,
3990
+ "loss": 1.1938,
3991
+ "step": 562
3992
+ },
3993
+ {
3994
+ "epoch": 1.8001598721023182,
3995
+ "grad_norm": 0.4605944752693176,
3996
+ "learning_rate": 5.1174083162680465e-06,
3997
+ "loss": 1.3145,
3998
+ "step": 563
3999
+ },
4000
+ {
4001
+ "epoch": 1.803357314148681,
4002
+ "grad_norm": 0.4929840862751007,
4003
+ "learning_rate": 4.957585541045684e-06,
4004
+ "loss": 1.146,
4005
+ "step": 564
4006
+ },
4007
+ {
4008
+ "epoch": 1.8065547561950441,
4009
+ "grad_norm": 0.5457170009613037,
4010
+ "learning_rate": 4.800234802103842e-06,
4011
+ "loss": 1.4803,
4012
+ "step": 565
4013
+ },
4014
+ {
4015
+ "epoch": 1.8097521982414069,
4016
+ "grad_norm": 0.4208320379257202,
4017
+ "learning_rate": 4.6453601921072395e-06,
4018
+ "loss": 1.0077,
4019
+ "step": 566
4020
+ },
4021
+ {
4022
+ "epoch": 1.8129496402877698,
4023
+ "grad_norm": 0.4881625175476074,
4024
+ "learning_rate": 4.492965739316901e-06,
4025
+ "loss": 1.2129,
4026
+ "step": 567
4027
+ },
4028
+ {
4029
+ "epoch": 1.8161470823341328,
4030
+ "grad_norm": 0.4958392381668091,
4031
+ "learning_rate": 4.34305540748543e-06,
4032
+ "loss": 1.3164,
4033
+ "step": 568
4034
+ },
4035
+ {
4036
+ "epoch": 1.8193445243804955,
4037
+ "grad_norm": 0.5175157785415649,
4038
+ "learning_rate": 4.195633095753859e-06,
4039
+ "loss": 1.5108,
4040
+ "step": 569
4041
+ },
4042
+ {
4043
+ "epoch": 1.8225419664268585,
4044
+ "grad_norm": 0.4751617908477783,
4045
+ "learning_rate": 4.050702638550275e-06,
4046
+ "loss": 1.2642,
4047
+ "step": 570
4048
+ },
4049
+ {
4050
+ "epoch": 1.8257394084732215,
4051
+ "grad_norm": 0.500234842300415,
4052
+ "learning_rate": 3.908267805490051e-06,
4053
+ "loss": 1.1798,
4054
+ "step": 571
4055
+ },
4056
+ {
4057
+ "epoch": 1.8289368505195842,
4058
+ "grad_norm": 0.4904455542564392,
4059
+ "learning_rate": 3.768332301277866e-06,
4060
+ "loss": 1.504,
4061
+ "step": 572
4062
+ },
4063
+ {
4064
+ "epoch": 1.8321342925659474,
4065
+ "grad_norm": 0.46577635407447815,
4066
+ "learning_rate": 3.630899765611251e-06,
4067
+ "loss": 1.2053,
4068
+ "step": 573
4069
+ },
4070
+ {
4071
+ "epoch": 1.8353317346123101,
4072
+ "grad_norm": 0.4044407308101654,
4073
+ "learning_rate": 3.495973773086014e-06,
4074
+ "loss": 0.9835,
4075
+ "step": 574
4076
+ },
4077
+ {
4078
+ "epoch": 1.838529176658673,
4079
+ "grad_norm": 0.5665880441665649,
4080
+ "learning_rate": 3.3635578331031814e-06,
4081
+ "loss": 1.6426,
4082
+ "step": 575
4083
+ },
4084
+ {
4085
+ "epoch": 1.841726618705036,
4086
+ "grad_norm": 0.5140945911407471,
4087
+ "learning_rate": 3.233655389777801e-06,
4088
+ "loss": 1.1846,
4089
+ "step": 576
4090
+ },
4091
+ {
4092
+ "epoch": 1.8449240607513988,
4093
+ "grad_norm": 0.4654732346534729,
4094
+ "learning_rate": 3.1062698218492724e-06,
4095
+ "loss": 1.4804,
4096
+ "step": 577
4097
+ },
4098
+ {
4099
+ "epoch": 1.8481215027977618,
4100
+ "grad_norm": 0.5238311290740967,
4101
+ "learning_rate": 2.9814044425935606e-06,
4102
+ "loss": 1.4453,
4103
+ "step": 578
4104
+ },
4105
+ {
4106
+ "epoch": 1.8513189448441247,
4107
+ "grad_norm": 0.47380512952804565,
4108
+ "learning_rate": 2.859062499736931e-06,
4109
+ "loss": 1.1711,
4110
+ "step": 579
4111
+ },
4112
+ {
4113
+ "epoch": 1.8545163868904875,
4114
+ "grad_norm": 0.4145112633705139,
4115
+ "learning_rate": 2.739247175371562e-06,
4116
+ "loss": 1.0746,
4117
+ "step": 580
4118
+ },
4119
+ {
4120
+ "epoch": 1.8577138289368507,
4121
+ "grad_norm": 0.4292459785938263,
4122
+ "learning_rate": 2.62196158587269e-06,
4123
+ "loss": 1.1786,
4124
+ "step": 581
4125
+ },
4126
+ {
4127
+ "epoch": 1.8609112709832134,
4128
+ "grad_norm": 0.45340195298194885,
4129
+ "learning_rate": 2.5072087818176382e-06,
4130
+ "loss": 1.2485,
4131
+ "step": 582
4132
+ },
4133
+ {
4134
+ "epoch": 1.8641087130295764,
4135
+ "grad_norm": 0.5513341426849365,
4136
+ "learning_rate": 2.3949917479063945e-06,
4137
+ "loss": 1.4376,
4138
+ "step": 583
4139
+ },
4140
+ {
4141
+ "epoch": 1.8673061550759393,
4142
+ "grad_norm": 0.4209807515144348,
4143
+ "learning_rate": 2.2853134028840594e-06,
4144
+ "loss": 1.1981,
4145
+ "step": 584
4146
+ },
4147
+ {
4148
+ "epoch": 1.870503597122302,
4149
+ "grad_norm": 0.47939029335975647,
4150
+ "learning_rate": 2.178176599464821e-06,
4151
+ "loss": 1.2066,
4152
+ "step": 585
4153
+ },
4154
+ {
4155
+ "epoch": 1.873701039168665,
4156
+ "grad_norm": 0.5361736416816711,
4157
+ "learning_rate": 2.073584124257899e-06,
4158
+ "loss": 1.4421,
4159
+ "step": 586
4160
+ },
4161
+ {
4162
+ "epoch": 1.876898481215028,
4163
+ "grad_norm": 0.4201466143131256,
4164
+ "learning_rate": 1.971538697694919e-06,
4165
+ "loss": 1.1246,
4166
+ "step": 587
4167
+ },
4168
+ {
4169
+ "epoch": 1.8800959232613907,
4170
+ "grad_norm": 0.5004400014877319,
4171
+ "learning_rate": 1.8720429739592982e-06,
4172
+ "loss": 1.3158,
4173
+ "step": 588
4174
+ },
4175
+ {
4176
+ "epoch": 1.883293365307754,
4177
+ "grad_norm": 0.4206116497516632,
4178
+ "learning_rate": 1.77509954091708e-06,
4179
+ "loss": 1.1231,
4180
+ "step": 589
4181
+ },
4182
+ {
4183
+ "epoch": 1.8864908073541167,
4184
+ "grad_norm": 0.5120659470558167,
4185
+ "learning_rate": 1.6807109200496995e-06,
4186
+ "loss": 1.3357,
4187
+ "step": 590
4188
+ },
4189
+ {
4190
+ "epoch": 1.8896882494004796,
4191
+ "grad_norm": 0.5396211743354797,
4192
+ "learning_rate": 1.5888795663883904e-06,
4193
+ "loss": 1.3456,
4194
+ "step": 591
4195
+ },
4196
+ {
4197
+ "epoch": 1.8928856914468426,
4198
+ "grad_norm": 1.0130178928375244,
4199
+ "learning_rate": 1.4996078684503144e-06,
4200
+ "loss": 1.0338,
4201
+ "step": 592
4202
+ },
4203
+ {
4204
+ "epoch": 1.8960831334932053,
4205
+ "grad_norm": 0.5105600953102112,
4206
+ "learning_rate": 1.4128981481764115e-06,
4207
+ "loss": 1.3242,
4208
+ "step": 593
4209
+ },
4210
+ {
4211
+ "epoch": 1.8992805755395683,
4212
+ "grad_norm": 0.5030739903450012,
4213
+ "learning_rate": 1.3287526608711131e-06,
4214
+ "loss": 1.2315,
4215
+ "step": 594
4216
+ },
4217
+ {
4218
+ "epoch": 1.9024780175859313,
4219
+ "grad_norm": 0.5108282566070557,
4220
+ "learning_rate": 1.247173595143536e-06,
4221
+ "loss": 1.3224,
4222
+ "step": 595
4223
+ },
4224
+ {
4225
+ "epoch": 1.905675459632294,
4226
+ "grad_norm": 0.45918774604797363,
4227
+ "learning_rate": 1.1681630728506699e-06,
4228
+ "loss": 1.181,
4229
+ "step": 596
4230
+ },
4231
+ {
4232
+ "epoch": 1.9088729016786572,
4233
+ "grad_norm": 0.49473392963409424,
4234
+ "learning_rate": 1.0917231490421232e-06,
4235
+ "loss": 1.4179,
4236
+ "step": 597
4237
+ },
4238
+ {
4239
+ "epoch": 1.91207034372502,
4240
+ "grad_norm": 0.4828788638114929,
4241
+ "learning_rate": 1.0178558119067315e-06,
4242
+ "loss": 1.3524,
4243
+ "step": 598
4244
+ },
4245
+ {
4246
+ "epoch": 1.915267785771383,
4247
+ "grad_norm": 0.9607488512992859,
4248
+ "learning_rate": 9.465629827207445e-07,
4249
+ "loss": 1.328,
4250
+ "step": 599
4251
+ },
4252
+ {
4253
+ "epoch": 1.9184652278177459,
4254
+ "grad_norm": 0.5251384377479553,
4255
+ "learning_rate": 8.778465157979976e-07,
4256
+ "loss": 1.3309,
4257
+ "step": 600
4258
+ },
4259
+ {
4260
+ "epoch": 1.9184652278177459,
4261
+ "eval_loss": 1.379648208618164,
4262
+ "eval_runtime": 21.9766,
4263
+ "eval_samples_per_second": 23.98,
4264
+ "eval_steps_per_second": 6.006,
4265
+ "step": 600
4266
+ },
4267
+ {
4268
+ "epoch": 1.9216626698641086,
4269
+ "grad_norm": 0.4664270281791687,
4270
+ "learning_rate": 8.117081984415298e-07,
4271
+ "loss": 1.2665,
4272
+ "step": 601
4273
+ },
4274
+ {
4275
+ "epoch": 1.9248601119104716,
4276
+ "grad_norm": 0.5185204148292542,
4277
+ "learning_rate": 7.481497508972312e-07,
4278
+ "loss": 1.1728,
4279
+ "step": 602
4280
+ },
4281
+ {
4282
+ "epoch": 1.9280575539568345,
4283
+ "grad_norm": 0.5853208303451538,
4284
+ "learning_rate": 6.871728263089794e-07,
4285
+ "loss": 1.4523,
4286
+ "step": 603
4287
+ },
4288
+ {
4289
+ "epoch": 1.9312549960031973,
4290
+ "grad_norm": 0.497002512216568,
4291
+ "learning_rate": 6.287790106757396e-07,
4292
+ "loss": 1.3046,
4293
+ "step": 604
4294
+ },
4295
+ {
4296
+ "epoch": 1.9344524380495605,
4297
+ "grad_norm": 0.6803504824638367,
4298
+ "learning_rate": 5.729698228102653e-07,
4299
+ "loss": 1.2796,
4300
+ "step": 605
4301
+ },
4302
+ {
4303
+ "epoch": 1.9376498800959232,
4304
+ "grad_norm": 0.4755241870880127,
4305
+ "learning_rate": 5.19746714299596e-07,
4306
+ "loss": 1.2355,
4307
+ "step": 606
4308
+ },
4309
+ {
4310
+ "epoch": 1.9408473221422862,
4311
+ "grad_norm": 0.4982335567474365,
4312
+ "learning_rate": 4.691110694673095e-07,
4313
+ "loss": 1.367,
4314
+ "step": 607
4315
+ },
4316
+ {
4317
+ "epoch": 1.9440447641886491,
4318
+ "grad_norm": 0.42912325263023376,
4319
+ "learning_rate": 4.210642053375069e-07,
4320
+ "loss": 1.2386,
4321
+ "step": 608
4322
+ },
4323
+ {
4324
+ "epoch": 1.9472422062350119,
4325
+ "grad_norm": 0.47803112864494324,
4326
+ "learning_rate": 3.756073716005837e-07,
4327
+ "loss": 1.2978,
4328
+ "step": 609
4329
+ },
4330
+ {
4331
+ "epoch": 1.950439648281375,
4332
+ "grad_norm": 0.49415600299835205,
4333
+ "learning_rate": 3.3274175058067846e-07,
4334
+ "loss": 1.162,
4335
+ "step": 610
4336
+ },
4337
+ {
4338
+ "epoch": 1.9536370903277378,
4339
+ "grad_norm": 0.4752744436264038,
4340
+ "learning_rate": 2.9246845720496407e-07,
4341
+ "loss": 1.2916,
4342
+ "step": 611
4343
+ },
4344
+ {
4345
+ "epoch": 1.9568345323741008,
4346
+ "grad_norm": 0.4329509735107422,
4347
+ "learning_rate": 2.547885389746485e-07,
4348
+ "loss": 1.2092,
4349
+ "step": 612
4350
+ },
4351
+ {
4352
+ "epoch": 1.9600319744204637,
4353
+ "grad_norm": 0.48668840527534485,
4354
+ "learning_rate": 2.1970297593767453e-07,
4355
+ "loss": 1.4974,
4356
+ "step": 613
4357
+ },
4358
+ {
4359
+ "epoch": 1.9632294164668265,
4360
+ "grad_norm": 0.4879298806190491,
4361
+ "learning_rate": 1.8721268066330676e-07,
4362
+ "loss": 1.2918,
4363
+ "step": 614
4364
+ },
4365
+ {
4366
+ "epoch": 1.9664268585131894,
4367
+ "grad_norm": 0.4559149146080017,
4368
+ "learning_rate": 1.5731849821833954e-07,
4369
+ "loss": 1.1624,
4370
+ "step": 615
4371
+ },
4372
+ {
4373
+ "epoch": 1.9696243005595524,
4374
+ "grad_norm": 0.541537880897522,
4375
+ "learning_rate": 1.300212061451367e-07,
4376
+ "loss": 1.3551,
4377
+ "step": 616
4378
+ },
4379
+ {
4380
+ "epoch": 1.9728217426059151,
4381
+ "grad_norm": 0.5082891583442688,
4382
+ "learning_rate": 1.0532151444140326e-07,
4383
+ "loss": 1.2856,
4384
+ "step": 617
4385
+ },
4386
+ {
4387
+ "epoch": 1.9760191846522783,
4388
+ "grad_norm": 0.4802534580230713,
4389
+ "learning_rate": 8.322006554171146e-08,
4390
+ "loss": 1.2623,
4391
+ "step": 618
4392
+ },
4393
+ {
4394
+ "epoch": 1.979216626698641,
4395
+ "grad_norm": 0.4262090027332306,
4396
+ "learning_rate": 6.37174343008251e-08,
4397
+ "loss": 0.9914,
4398
+ "step": 619
4399
+ },
4400
+ {
4401
+ "epoch": 1.982414068745004,
4402
+ "grad_norm": 0.5872986912727356,
4403
+ "learning_rate": 4.6814127978722644e-08,
4404
+ "loss": 1.3731,
4405
+ "step": 620
4406
+ },
4407
+ {
4408
+ "epoch": 1.985611510791367,
4409
+ "grad_norm": 0.5640340447425842,
4410
+ "learning_rate": 3.251058622737446e-08,
4411
+ "loss": 1.5329,
4412
+ "step": 621
4413
+ },
4414
+ {
4415
+ "epoch": 1.9888089528377297,
4416
+ "grad_norm": 0.5046076774597168,
4417
+ "learning_rate": 2.080718107935198e-08,
4418
+ "loss": 1.4028,
4419
+ "step": 622
4420
+ },
4421
+ {
4422
+ "epoch": 1.9920063948840927,
4423
+ "grad_norm": 0.4681106209754944,
4424
+ "learning_rate": 1.1704216938146496e-08,
4425
+ "loss": 1.0768,
4426
+ "step": 623
4427
+ },
4428
+ {
4429
+ "epoch": 1.9952038369304557,
4430
+ "grad_norm": 0.5022634863853455,
4431
+ "learning_rate": 5.201930570242208e-09,
4432
+ "loss": 1.255,
4433
+ "step": 624
4434
+ },
4435
+ {
4436
+ "epoch": 1.9984012789768184,
4437
+ "grad_norm": 0.4953170716762543,
4438
+ "learning_rate": 1.3004910989433682e-09,
4439
+ "loss": 1.4848,
4440
+ "step": 625
4441
+ },
4442
+ {
4443
+ "epoch": 2.0015987210231816,
4444
+ "grad_norm": 0.49646100401878357,
4445
+ "learning_rate": 0.0,
4446
+ "loss": 1.2531,
4447
+ "step": 626
4448
  }
4449
  ],
4450
  "logging_steps": 1,
 
4468
  "should_evaluate": false,
4469
  "should_log": false,
4470
  "should_save": true,
4471
+ "should_training_stop": true
4472
  },
4473
  "attributes": {}
4474
  }
4475
  },
4476
+ "total_flos": 3.0168043503589786e+17,
4477
  "train_batch_size": 4,
4478
  "trial_name": null,
4479
  "trial_params": null