yuzhounie committed · Commit da9401d · verified · 1 parent: fee3357

End of training
Files changed (5)
  1. README.md +2 -1
  2. all_results.json +8 -0
  3. train_results.json +8 -0
  4. trainer_state.json +1303 -0
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
 base_model: Qwen/Qwen2.5-Coder-32B-Instruct
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: SWE-BENCH-5k-first-2000-claude-search-replace-generation_qwen_code_32B_5k_first_2000_generation
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # SWE-BENCH-5k-first-2000-claude-search-replace-generation_qwen_code_32B_5k_first_2000_generation
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) on the SWE-BENCH-5k-first-2000-claude-search-replace-generation dataset.
 
 ## Model description
 
all_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.96398891966759,
+    "total_flos": 6.743893969836442e+16,
+    "train_loss": 0.3866574793226189,
+    "train_runtime": 24143.75,
+    "train_samples_per_second": 0.179,
+    "train_steps_per_second": 0.007
+}
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+    "epoch": 2.96398891966759,
+    "total_flos": 6.743893969836442e+16,
+    "train_loss": 0.3866574793226189,
+    "train_runtime": 24143.75,
+    "train_samples_per_second": 0.179,
+    "train_steps_per_second": 0.007
+}
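The run statistics above can be read back programmatically. Below is a minimal sketch (not part of the commit) that inlines the train_results.json shown above and derives the approximate number of samples seen from runtime and throughput:

```python
import json

# train_results.json content, copied verbatim from the diff above.
train_results = json.loads("""
{
    "epoch": 2.96398891966759,
    "total_flos": 6.743893969836442e+16,
    "train_loss": 0.3866574793226189,
    "train_runtime": 24143.75,
    "train_samples_per_second": 0.179,
    "train_steps_per_second": 0.007
}
""")

# train_runtime is in seconds, so throughput * runtime approximates the
# total number of samples processed across the ~3 epochs of the run.
samples_seen = train_results["train_runtime"] * train_results["train_samples_per_second"]
print(f"approx. samples seen: {samples_seen:.0f}")
print(f"final average train loss: {train_results['train_loss']:.4f}")
```

Note the reported per-second rates are rounded, so quantities derived from them are approximate.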
trainer_state.json ADDED
@@ -0,0 +1,1303 @@
+{
+    "best_global_step": null,
+    "best_metric": null,
+    "best_model_checkpoint": null,
+    "epoch": 2.96398891966759,
+    "eval_steps": 500,
+    "global_step": 180,
+    "is_hyper_param_search": false,
+    "is_local_process_zero": true,
+    "is_world_process_zero": true,
+    "log_history": [
+        {"epoch": 0.01662049861495845, "grad_norm": 1.9652302265167236, "learning_rate": 0.0, "loss": 0.7152, "step": 1},
+        {"epoch": 0.0332409972299169, "grad_norm": 2.135629177093506, "learning_rate": 5.555555555555555e-07, "loss": 0.7024, "step": 2},
+        {"epoch": 0.04986149584487535, "grad_norm": 2.365844964981079, "learning_rate": 1.111111111111111e-06, "loss": 0.7755, "step": 3},
+        {"epoch": 0.0664819944598338, "grad_norm": 1.939900517463684, "learning_rate": 1.6666666666666667e-06, "loss": 0.7134, "step": 4},
+        {"epoch": 0.08310249307479224, "grad_norm": 1.8507870435714722, "learning_rate": 2.222222222222222e-06, "loss": 0.6644, "step": 5},
+        {"epoch": 0.0997229916897507, "grad_norm": 1.8390847444534302, "learning_rate": 2.7777777777777783e-06, "loss": 0.7306, "step": 6},
+        {"epoch": 0.11634349030470914, "grad_norm": 1.2149966955184937, "learning_rate": 3.3333333333333333e-06, "loss": 0.5377, "step": 7},
+        {"epoch": 0.1329639889196676, "grad_norm": 1.203329086303711, "learning_rate": 3.88888888888889e-06, "loss": 0.6448, "step": 8},
+        {"epoch": 0.14958448753462603, "grad_norm": 1.1259090900421143, "learning_rate": 4.444444444444444e-06, "loss": 0.6041, "step": 9},
+        {"epoch": 0.16620498614958448, "grad_norm": 0.9785488247871399, "learning_rate": 5e-06, "loss": 0.6802, "step": 10},
+        {"epoch": 0.18282548476454294, "grad_norm": 0.7702904343605042, "learning_rate": 5.555555555555557e-06, "loss": 0.5737, "step": 11},
+        {"epoch": 0.1994459833795014, "grad_norm": 0.7972448468208313, "learning_rate": 6.111111111111112e-06, "loss": 0.6071, "step": 12},
+        {"epoch": 0.21606648199445982, "grad_norm": 0.8643639087677002, "learning_rate": 6.666666666666667e-06, "loss": 0.5645, "step": 13},
+        {"epoch": 0.23268698060941828, "grad_norm": 0.822340190410614, "learning_rate": 7.222222222222223e-06, "loss": 0.5512, "step": 14},
+        {"epoch": 0.24930747922437674, "grad_norm": 1.0604660511016846, "learning_rate": 7.77777777777778e-06, "loss": 0.5875, "step": 15},
+        {"epoch": 0.2659279778393352, "grad_norm": 0.8126739263534546, "learning_rate": 8.333333333333334e-06, "loss": 0.5601, "step": 16},
+        {"epoch": 0.28254847645429365, "grad_norm": 0.7240079641342163, "learning_rate": 8.888888888888888e-06, "loss": 0.5724, "step": 17},
+        {"epoch": 0.29916897506925205, "grad_norm": 0.6566236615180969, "learning_rate": 9.444444444444445e-06, "loss": 0.5535, "step": 18},
+        {"epoch": 0.3157894736842105, "grad_norm": 0.7229272723197937, "learning_rate": 1e-05, "loss": 0.5413, "step": 19},
+        {"epoch": 0.33240997229916897, "grad_norm": 0.6160261034965515, "learning_rate": 9.999059852242508e-06, "loss": 0.4809, "step": 20},
+        {"epoch": 0.3490304709141274, "grad_norm": 0.5426657199859619, "learning_rate": 9.996239762521152e-06, "loss": 0.4453, "step": 21},
+        {"epoch": 0.3656509695290859, "grad_norm": 0.6986624002456665, "learning_rate": 9.991540791356342e-06, "loss": 0.5704, "step": 22},
+        {"epoch": 0.38227146814404434, "grad_norm": 0.6466948986053467, "learning_rate": 9.98496470583896e-06, "loss": 0.5222, "step": 23},
+        {"epoch": 0.3988919667590028, "grad_norm": 0.5881003141403198, "learning_rate": 9.976513978965829e-06, "loss": 0.4903, "step": 24},
+        {"epoch": 0.4155124653739612, "grad_norm": 0.5835773348808289, "learning_rate": 9.966191788709716e-06, "loss": 0.4936, "step": 25},
+        {"epoch": 0.43213296398891965, "grad_norm": 0.5974717736244202, "learning_rate": 9.954002016824226e-06, "loss": 0.544, "step": 26},
+        {"epoch": 0.4487534626038781, "grad_norm": 0.6126233339309692, "learning_rate": 9.939949247384046e-06, "loss": 0.5313, "step": 27},
+        {"epoch": 0.46537396121883656, "grad_norm": 0.5605891942977905, "learning_rate": 9.924038765061042e-06, "loss": 0.5121, "step": 28},
+        {"epoch": 0.481994459833795, "grad_norm": 0.523395299911499, "learning_rate": 9.906276553136924e-06, "loss": 0.4705, "step": 29},
+        {"epoch": 0.4986149584487535, "grad_norm": 0.5597982406616211, "learning_rate": 9.886669291253178e-06, "loss": 0.4951, "step": 30},
+        {"epoch": 0.5152354570637119, "grad_norm": 0.5273374915122986, "learning_rate": 9.86522435289912e-06, "loss": 0.4763, "step": 31},
+        {"epoch": 0.5318559556786704, "grad_norm": 0.5255304574966431, "learning_rate": 9.841949802639031e-06, "loss": 0.5133, "step": 32},
+        {"epoch": 0.5484764542936288, "grad_norm": 0.8223831057548523, "learning_rate": 9.816854393079402e-06, "loss": 0.4865, "step": 33},
+        {"epoch": 0.5650969529085873, "grad_norm": 0.4619203805923462, "learning_rate": 9.789947561577445e-06, "loss": 0.4631, "step": 34},
+        {"epoch": 0.5817174515235457, "grad_norm": 0.4974648654460907, "learning_rate": 9.761239426692077e-06, "loss": 0.5039, "step": 35},
+        {"epoch": 0.5983379501385041, "grad_norm": 0.5178198218345642, "learning_rate": 9.730740784378755e-06, "loss": 0.4618, "step": 36},
+        {"epoch": 0.6149584487534626, "grad_norm": 0.5592218637466431, "learning_rate": 9.698463103929542e-06, "loss": 0.4777, "step": 37},
+        {"epoch": 0.631578947368421, "grad_norm": 0.4956098198890686, "learning_rate": 9.664418523660004e-06, "loss": 0.4925, "step": 38},
+        {"epoch": 0.6481994459833795, "grad_norm": 0.48805150389671326, "learning_rate": 9.628619846344453e-06, "loss": 0.4423, "step": 39},
+        {"epoch": 0.6648199445983379, "grad_norm": 0.5749639868736267, "learning_rate": 9.591080534401371e-06, "loss": 0.55, "step": 40},
+        {"epoch": 0.6814404432132964, "grad_norm": 0.7393980622291565, "learning_rate": 9.551814704830734e-06, "loss": 0.426, "step": 41},
+        {"epoch": 0.6980609418282548, "grad_norm": 0.5011327862739563, "learning_rate": 9.51083712390519e-06, "loss": 0.4628, "step": 42},
+        {"epoch": 0.7146814404432132, "grad_norm": 0.572926938533783, "learning_rate": 9.468163201617063e-06, "loss": 0.527, "step": 43},
+        {"epoch": 0.7313019390581718, "grad_norm": 0.5243227481842041, "learning_rate": 9.423808985883289e-06, "loss": 0.5115, "step": 44},
+        {"epoch": 0.7479224376731302, "grad_norm": 0.5271593928337097, "learning_rate": 9.377791156510456e-06, "loss": 0.4921, "step": 45},
+        {"epoch": 0.7645429362880887, "grad_norm": 0.5143831968307495, "learning_rate": 9.330127018922195e-06, "loss": 0.4842, "step": 46},
+        {"epoch": 0.7811634349030471, "grad_norm": 0.5135733485221863, "learning_rate": 9.280834497651334e-06, "loss": 0.4939, "step": 47},
+        {"epoch": 0.7977839335180056, "grad_norm": 0.5173041820526123, "learning_rate": 9.229932129599206e-06, "loss": 0.4819, "step": 48},
+        {"epoch": 0.814404432132964, "grad_norm": 0.570851743221283, "learning_rate": 9.177439057064684e-06, "loss": 0.5439, "step": 49},
+        {"epoch": 0.8310249307479224, "grad_norm": 0.552671492099762, "learning_rate": 9.123375020545534e-06, "loss": 0.4669, "step": 50},
+        {"epoch": 0.8476454293628809, "grad_norm": 0.5668032765388489, "learning_rate": 9.067760351314838e-06, "loss": 0.5138, "step": 51},
+        {"epoch": 0.8642659279778393, "grad_norm": 0.48532989621162415, "learning_rate": 9.01061596377522e-06, "loss": 0.4827, "step": 52},
+        {"epoch": 0.8808864265927978, "grad_norm": 0.4953126311302185, "learning_rate": 8.951963347593797e-06, "loss": 0.4273, "step": 53},
+        {"epoch": 0.8975069252077562, "grad_norm": 0.5042351484298706, "learning_rate": 8.891824559620801e-06, "loss": 0.5311, "step": 54},
+        {"epoch": 0.9141274238227147, "grad_norm": 0.532244086265564, "learning_rate": 8.83022221559489e-06, "loss": 0.5364, "step": 55},
+        {"epoch": 0.9307479224376731, "grad_norm": 0.5507211089134216, "learning_rate": 8.767179481638303e-06, "loss": 0.5264, "step": 56},
+        {"epoch": 0.9473684210526315, "grad_norm": 0.5117627382278442, "learning_rate": 8.702720065545024e-06, "loss": 0.4994, "step": 57},
+        {"epoch": 0.96398891966759, "grad_norm": 0.6424684524536133, "learning_rate": 8.636868207865244e-06, "loss": 0.5321, "step": 58},
+        {"epoch": 0.9806094182825484, "grad_norm": 0.5632804036140442, "learning_rate": 8.569648672789496e-06, "loss": 0.5354, "step": 59},
+        {"epoch": 0.997229916897507, "grad_norm": 0.5519580841064453, "learning_rate": 8.501086738835843e-06, "loss": 0.5502, "step": 60},
+        {"epoch": 1.0, "grad_norm": 0.5519580841064453, "learning_rate": 8.43120818934367e-06, "loss": 0.4298, "step": 61},
+        {"epoch": 1.0166204986149585, "grad_norm": 1.4024403095245361, "learning_rate": 8.360039302777614e-06, "loss": 0.3848, "step": 62},
+        {"epoch": 1.0332409972299168, "grad_norm": 0.4745033085346222, "learning_rate": 8.28760684284532e-06, "loss": 0.4, "step": 63},
+        {"epoch": 1.0498614958448753, "grad_norm": 0.5079669952392578, "learning_rate": 8.213938048432697e-06, "loss": 0.3824, "step": 64},
+        {"epoch": 1.0664819944598338, "grad_norm": 0.49697190523147583, "learning_rate": 8.139060623360494e-06, "loss": 0.4243, "step": 65},
+        {"epoch": 1.0831024930747923, "grad_norm": 0.4616394639015198, "learning_rate": 8.063002725966014e-06, "loss": 0.3888, "step": 66},
+        {"epoch": 1.0997229916897506, "grad_norm": 0.4260391294956207, "learning_rate": 7.985792958513932e-06, "loss": 0.3406, "step": 67},
+        {"epoch": 1.1163434903047091, "grad_norm": 0.47153493762016296, "learning_rate": 7.907460356440133e-06, "loss": 0.3636, "step": 68},
+        {"epoch": 1.1329639889196677, "grad_norm": 0.5076174139976501, "learning_rate": 7.828034377432694e-06, "loss": 0.4166, "step": 69},
+        {"epoch": 1.149584487534626, "grad_norm": 0.5310080647468567, "learning_rate": 7.747544890354031e-06, "loss": 0.4311, "step": 70},
+        {"epoch": 1.1662049861495845, "grad_norm": 0.5010002851486206, "learning_rate": 7.666022164008458e-06, "loss": 0.3193, "step": 71},
+        {"epoch": 1.182825484764543, "grad_norm": 0.49259936809539795, "learning_rate": 7.5834968557593155e-06, "loss": 0.3456, "step": 72},
+        {"epoch": 1.1994459833795015, "grad_norm": 0.5213885307312012, "learning_rate": 7.500000000000001e-06, "loss": 0.3615, "step": 73},
+        {"epoch": 1.2160664819944598, "grad_norm": 0.512752115726471, "learning_rate": 7.415562996483193e-06, "loss": 0.3569, "step": 74},
+        {"epoch": 1.2326869806094183, "grad_norm": 0.5139035582542419, "learning_rate": 7.330217598512696e-06, "loss": 0.3859, "step": 75},
+        {"epoch": 1.2493074792243768, "grad_norm": 0.5561084151268005, "learning_rate": 7.243995901002312e-06, "loss": 0.363, "step": 76},
+        {"epoch": 1.2659279778393353, "grad_norm": 0.49844229221343994, "learning_rate": 7.156930328406268e-06, "loss": 0.3648, "step": 77},
+        {"epoch": 1.2825484764542936, "grad_norm": 0.5111745595932007, "learning_rate": 7.069053622525697e-06, "loss": 0.3453, "step": 78},
+        {"epoch": 1.299168975069252, "grad_norm": 0.5968831777572632, "learning_rate": 6.980398830195785e-06, "loss": 0.3601, "step": 79},
+        {"epoch": 1.3157894736842106, "grad_norm": 0.3998188376426697, "learning_rate": 6.890999290858213e-06, "loss": 0.2965, "step": 80},
+        {"epoch": 1.332409972299169, "grad_norm": 0.5044348239898682, "learning_rate": 6.800888624023552e-06, "loss": 0.3579, "step": 81},
+        {"epoch": 1.3490304709141274, "grad_norm": 0.499636709690094, "learning_rate": 6.710100716628345e-06, "loss": 0.3751, "step": 82},
+        {"epoch": 1.365650969529086, "grad_norm": 0.5045871734619141, "learning_rate": 6.618669710291607e-06, "loss": 0.3782, "step": 83},
+        {"epoch": 1.3822714681440442, "grad_norm": 0.5296726822853088, "learning_rate": 6.526629988475567e-06, "loss": 0.413, "step": 84},
+        {"epoch": 1.3988919667590027, "grad_norm": 0.5541542768478394, "learning_rate": 6.434016163555452e-06, "loss": 0.4176, "step": 85},
+        {"epoch": 1.4155124653739612, "grad_norm": 0.52264803647995, "learning_rate": 6.340863063803187e-06, "loss": 0.3687, "step": 86},
+        {"epoch": 1.4321329639889195, "grad_norm": 0.5726013779640198, "learning_rate": 6.247205720289907e-06, "loss": 0.4127, "step": 87},
+        {"epoch": 1.448753462603878, "grad_norm": 0.5129911303520203, "learning_rate": 6.153079353712201e-06, "loss": 0.3608, "step": 88},
+        {"epoch": 1.4653739612188366, "grad_norm": 0.5869404673576355, "learning_rate": 6.058519361147055e-06, "loss": 0.369, "step": 89},
+        {"epoch": 1.481994459833795, "grad_norm": 0.4603992998600006, "learning_rate": 5.9635613027404495e-06, "loss": 0.2792, "step": 90},
+        {"epoch": 1.4986149584487536, "grad_norm": 0.433829128742218, "learning_rate": 5.8682408883346535e-06, "loss": 0.2935, "step": 91},
+        {"epoch": 1.5152354570637119, "grad_norm": 0.4892548620700836, "learning_rate": 5.772593964039203e-06, "loss": 0.3591, "step": 92},
+        {"epoch": 1.5318559556786704, "grad_norm": 0.4414325952529907, "learning_rate": 5.6766564987506564e-06, "loss": 0.3312, "step": 93},
+        {"epoch": 1.548476454293629, "grad_norm": 0.5104185938835144, "learning_rate": 5.5804645706261515e-06, "loss": 0.3524, "step": 94},
+        {"epoch": 1.5650969529085872, "grad_norm": 0.46491438150405884, "learning_rate": 5.484054353515896e-06, "loss": 0.3127, "step": 95},
+        {"epoch": 1.5817174515235457, "grad_norm": 0.5037529468536377, "learning_rate": 5.387462103359655e-06, "loss": 0.3549, "step": 96},
+        {"epoch": 1.5983379501385042, "grad_norm": 0.456927090883255, "learning_rate": 5.290724144552379e-06, "loss": 0.3583, "step": 97},
+        {"epoch": 1.6149584487534625, "grad_norm": 0.48146891593933105, "learning_rate": 5.193876856284085e-06, "loss": 0.3485, "step": 98},
+        {"epoch": 1.631578947368421, "grad_norm": 0.45695117115974426, "learning_rate": 5.096956658859122e-06, "loss": 0.3325, "step": 99},
+        {"epoch": 1.6481994459833795, "grad_norm": 0.46289077401161194, "learning_rate": 5e-06, "loss": 0.3461, "step": 100},
+        {"epoch": 1.6648199445983378, "grad_norm": 0.5340746641159058, "learning_rate": 4.903043341140879e-06, "loss": 0.3856, "step": 101},
+        {"epoch": 1.6814404432132966, "grad_norm": 0.433956503868103, "learning_rate": 4.806123143715916e-06, "loss": 0.3166, "step": 102},
+        {"epoch": 1.6980609418282548, "grad_norm": 0.4446304440498352, "learning_rate": 4.7092758554476215e-06, "loss": 0.3378, "step": 103},
+        {"epoch": 1.7146814404432131, "grad_norm": 0.5027093291282654, "learning_rate": 4.6125378966403465e-06, "loss": 0.3915, "step": 104},
+        {"epoch": 1.7313019390581719, "grad_norm": 0.5546647310256958, "learning_rate": 4.515945646484105e-06, "loss": 0.3484, "step": 105},
+        {"epoch": 1.7479224376731302, "grad_norm": 0.49674123525619507, "learning_rate": 4.4195354293738484e-06, "loss": 0.3501, "step": 106},
+        {"epoch": 1.7645429362880887, "grad_norm": 0.5134773850440979, "learning_rate": 4.323343501249346e-06, "loss": 0.3818, "step": 107},
+        {"epoch": 1.7811634349030472, "grad_norm": 0.5111790299415588, "learning_rate": 4.227406035960798e-06, "loss": 0.4027, "step": 108},
+        {"epoch": 1.7977839335180055, "grad_norm": 0.5103554129600525, "learning_rate": 4.131759111665349e-06, "loss": 0.3295, "step": 109},
+        {"epoch": 1.814404432132964, "grad_norm": 0.48488280177116394, "learning_rate": 4.036438697259551e-06, "loss": 0.3339, "step": 110},
+        {"epoch": 1.8310249307479225, "grad_norm": 0.4840296506881714, "learning_rate": 3.941480638852948e-06, "loss": 0.3519, "step": 111},
+        {"epoch": 1.8476454293628808, "grad_norm": 0.4919949471950531, "learning_rate": 3.8469206462878e-06, "loss": 0.328, "step": 112},
+        {"epoch": 1.8642659279778393, "grad_norm": 0.5291365385055542, "learning_rate": 3.752794279710094e-06, "loss": 0.3753, "step": 113},
+        {"epoch": 1.8808864265927978, "grad_norm": 0.4807715117931366, "learning_rate": 3.6591369361968127e-06, "loss": 0.393, "step": 114},
+        {"epoch": 1.897506925207756, "grad_norm": 0.4700012803077698, "learning_rate": 3.5659838364445505e-06, "loss": 0.3182, "step": 115},
+        {"epoch": 1.9141274238227148, "grad_norm": 1.0692706108093262, "learning_rate": 3.473370011524435e-06, "loss": 0.3463, "step": 116},
+        {"epoch": 1.9307479224376731, "grad_norm": 0.49183958768844604, "learning_rate": 3.3813302897083955e-06, "loss": 0.3694, "step": 117},
+        {"epoch": 1.9473684210526314, "grad_norm": 0.5577133893966675, "learning_rate": 3.289899283371657e-06, "loss": 0.3693, "step": 118},
+        {"epoch": 1.9639889196675901, "grad_norm": 0.47118237614631653, "learning_rate": 3.1991113759764493e-06, "loss": 0.3325, "step": 119},
+        {"epoch": 1.9806094182825484, "grad_norm": 0.44954901933670044, "learning_rate": 3.1090007091417884e-06, "loss": 0.3497, "step": 120},
+        {"epoch": 1.997229916897507, "grad_norm": 0.5316449403762817, "learning_rate": 3.019601169804216e-06, "loss": 0.4239, "step": 121},
+        {"epoch": 2.0, "grad_norm": 0.5316449403762817, "learning_rate": 2.9309463774743047e-06, "loss": 0.302, "step": 122},
+        {"epoch": 2.0166204986149583, "grad_norm": 1.3086326122283936, "learning_rate": 2.843069671593734e-06, "loss": 0.2255, "step": 123},
+        {"epoch": 2.033240997229917, "grad_norm": 0.4746488928794861, "learning_rate": 2.7560040989976894e-06, "loss": 0.2275, "step": 124},
+        {"epoch": 2.0498614958448753, "grad_norm": 0.4944143295288086, "learning_rate": 2.6697824014873076e-06, "loss": 0.2648, "step": 125},
+        {"epoch": 2.0664819944598336, "grad_norm": 0.5195774435997009, "learning_rate": 2.5844370035168077e-06, "loss": 0.2707, "step": 126},
+        {"epoch": 2.0831024930747923, "grad_norm": 0.885553240776062, "learning_rate": 2.5000000000000015e-06, "loss": 0.2764, "step": 127},
+        {"epoch": 2.0997229916897506, "grad_norm": 0.5028234124183655, "learning_rate": 2.4165031442406857e-06, "loss": 0.2503, "step": 128},
+        {"epoch": 2.1163434903047094, "grad_norm": 0.4780957102775574, "learning_rate": 2.333977835991545e-06, "loss": 0.2406, "step": 129},
+        {"epoch": 2.1329639889196677, "grad_norm": 0.46052825450897217, "learning_rate": 2.2524551096459703e-06, "loss": 0.2155, "step": 130},
+        {"epoch": 2.149584487534626, "grad_norm": 0.6180452704429626, "learning_rate": 2.171965622567308e-06, "loss": 0.2787, "step": 131},
+        {"epoch": 2.1662049861495847, "grad_norm": 0.6939100027084351, "learning_rate": 2.0925396435598665e-06, "loss": 0.246, "step": 132},
+        {"epoch": 2.182825484764543, "grad_norm": 0.6042692065238953, "learning_rate": 2.0142070414860704e-06, "loss": 0.2609, "step": 133},
+        {"epoch": 2.1994459833795013, "grad_norm": 0.7851183414459229, "learning_rate": 1.936997274033986e-06, "loss": 0.2876, "step": 134},
+        {"epoch": 2.21606648199446, "grad_norm": 0.5801565051078796, "learning_rate": 1.8609393766395083e-06, "loss": 0.288, "step": 135},
+        {"epoch": 2.2326869806094183, "grad_norm": 0.5398533940315247, "learning_rate": 1.7860619515673034e-06, "loss": 0.2958, "step": 136},
+        {"epoch": 2.2493074792243766, "grad_norm": 0.48142921924591064, "learning_rate": 1.7123931571546826e-06, "loss": 0.2506, "step": 137},
+        {"epoch": 2.2659279778393353, "grad_norm": 0.48484477400779724, "learning_rate": 1.639960697222388e-06, "loss": 0.2166, "step": 138},
+        {"epoch": 2.2825484764542936, "grad_norm": 0.4676513075828552, "learning_rate": 1.5687918106563326e-06, "loss": 0.2558, "step": 139},
+        {"epoch": 2.299168975069252, "grad_norm": 0.5008206963539124, "learning_rate": 1.4989132611641576e-06, "loss": 0.2315, "step": 140},
+        {"epoch": 2.3157894736842106, "grad_norm": 0.5055615901947021, "learning_rate": 1.4303513272105057e-06, "loss": 0.278, "step": 141},
+        {"epoch": 2.332409972299169, "grad_norm": 0.5048314332962036, "learning_rate": 1.3631317921347564e-06, "loss": 0.2469, "step": 142},
+        {"epoch": 2.349030470914127, "grad_norm": 0.4561052620410919, "learning_rate": 1.297279934454978e-06, "loss": 0.2363, "step": 143},
+        {"epoch": 2.365650969529086, "grad_norm": 0.4409971237182617, "learning_rate": 1.2328205183616964e-06, "loss": 0.2582, "step": 144},
+        {"epoch": 2.3822714681440442, "grad_norm": 0.5186073780059814, "learning_rate": 1.1697777844051105e-06, "loss": 0.2354, "step": 145},
+        {"epoch": 2.398891966759003, "grad_norm": 0.4931983947753906, "learning_rate": 1.1081754403792e-06, "loss": 0.2628, "step": 146},
+        {"epoch": 2.4155124653739612, "grad_norm": 0.4725812077522278, "learning_rate": 1.0480366524062041e-06, "loss": 0.2465, "step": 147},
+        {"epoch": 2.4321329639889195, "grad_norm": 0.459830641746521, "learning_rate": 9.893840362247809e-07, "loss": 0.2494, "step": 148},
+        {"epoch": 2.4487534626038783, "grad_norm": 0.45882484316825867, "learning_rate": 9.322396486851626e-07, "loss": 0.2572, "step": 149},
+        {"epoch": 2.4653739612188366, "grad_norm": 0.4628044664859772, "learning_rate": 8.766249794544662e-07, "loss": 0.2473, "step": 150},
+        {"epoch": 2.481994459833795, "grad_norm": 0.43482884764671326, "learning_rate": 8.225609429353187e-07, "loss": 0.2334, "step": 151},
+        {"epoch": 2.4986149584487536, "grad_norm": 0.5092786550521851, "learning_rate": 7.700678704007947e-07, "loss": 0.2464, "step": 152},
+        {"epoch": 2.515235457063712, "grad_norm": 0.5002970695495605, "learning_rate": 7.191655023486682e-07, "loss": 0.2386, "step": 153},
+        {"epoch": 2.5318559556786706, "grad_norm": 0.44085896015167236, "learning_rate": 6.698729810778065e-07, "loss": 0.2231, "step": 154},
+        {"epoch": 2.548476454293629, "grad_norm": 0.4750898480415344, "learning_rate": 6.222088434895462e-07, "loss": 0.2746, "step": 155},
+        {"epoch": 2.565096952908587, "grad_norm": 0.5058760643005371, "learning_rate": 5.76191014116711e-07, "loss": 0.2753, "step": 156},
+        {"epoch": 2.581717451523546, "grad_norm": 0.4807314872741699, "learning_rate": 5.318367983829393e-07, "loss": 0.2295, "step": 157},
+        {"epoch": 2.598337950138504, "grad_norm": 0.4975450336933136, "learning_rate": 4.891628760948114e-07, "loss": 0.2623, "step": 158},
+        {"epoch": 2.6149584487534625, "grad_norm": 0.44517505168914795, "learning_rate": 4.481852951692672e-07, "loss": 0.2505, "step": 159},
+        {"epoch": 2.6315789473684212, "grad_norm": 0.526871919631958, "learning_rate": 4.089194655986306e-07, "loss": 0.2944, "step": 160},
+        {"epoch": 2.6481994459833795, "grad_norm": 0.5860976576805115, "learning_rate": 3.7138015365554834e-07, "loss": 0.2929, "step": 161},
+        {"epoch": 2.664819944598338, "grad_norm": 0.5570012927055359, "learning_rate": 3.355814763399973e-07, "loss": 0.2669, "step": 162},
+        {"epoch": 2.6814404432132966, "grad_norm": 0.46305856108665466, "learning_rate": 3.015368960704584e-07, "loss": 0.2464, "step": 163},
+        {"epoch": 2.698060941828255, "grad_norm": 0.49931517243385315, "learning_rate": 2.6925921562124867e-07, "loss": 0.233, "step": 164},
+        {"epoch": 2.714681440443213, "grad_norm": 0.4253719449043274,
1163
+ "learning_rate": 2.3876057330792344e-07,
1164
+ "loss": 0.2115,
1165
+ "step": 165
1166
+ },
1167
+ {
1168
+ "epoch": 2.731301939058172,
1169
+ "grad_norm": 0.46956562995910645,
1170
+ "learning_rate": 2.1005243842255552e-07,
1171
+ "loss": 0.2419,
1172
+ "step": 166
1173
+ },
1174
+ {
1175
+ "epoch": 2.74792243767313,
1176
+ "grad_norm": 0.47405821084976196,
1177
+ "learning_rate": 1.8314560692059836e-07,
1178
+ "loss": 0.2442,
1179
+ "step": 167
1180
+ },
1181
+ {
1182
+ "epoch": 2.7645429362880884,
1183
+ "grad_norm": 0.5373594164848328,
1184
+ "learning_rate": 1.5805019736097105e-07,
1185
+ "loss": 0.304,
1186
+ "step": 168
1187
+ },
1188
+ {
1189
+ "epoch": 2.781163434903047,
1190
+ "grad_norm": 0.49911409616470337,
1191
+ "learning_rate": 1.3477564710088097e-07,
1192
+ "loss": 0.2604,
1193
+ "step": 169
1194
+ },
1195
+ {
1196
+ "epoch": 2.7977839335180055,
1197
+ "grad_norm": 0.524211585521698,
1198
+ "learning_rate": 1.1333070874682217e-07,
1199
+ "loss": 0.2319,
1200
+ "step": 170
1201
+ },
1202
+ {
1203
+ "epoch": 2.8144044321329638,
1204
+ "grad_norm": 0.49799832701683044,
1205
+ "learning_rate": 9.372344686307655e-08,
1206
+ "loss": 0.2648,
1207
+ "step": 171
1208
+ },
1209
+ {
1210
+ "epoch": 2.8310249307479225,
1211
+ "grad_norm": 0.4979800581932068,
1212
+ "learning_rate": 7.59612349389599e-08,
1213
+ "loss": 0.2671,
1214
+ "step": 172
1215
+ },
1216
+ {
1217
+ "epoch": 2.847645429362881,
1218
+ "grad_norm": 0.5030661225318909,
1219
+ "learning_rate": 6.005075261595495e-08,
1220
+ "loss": 0.2219,
1221
+ "step": 173
1222
+ },
1223
+ {
1224
+ "epoch": 2.864265927977839,
1225
+ "grad_norm": 0.4839530885219574,
1226
+ "learning_rate": 4.599798317577342e-08,
1227
+ "loss": 0.2981,
1228
+ "step": 174
1229
+ },
1230
+ {
1231
+ "epoch": 2.880886426592798,
1232
+ "grad_norm": 0.49113729596138,
1233
+ "learning_rate": 3.3808211290284886e-08,
1234
+ "loss": 0.2574,
1235
+ "step": 175
1236
+ },
1237
+ {
1238
+ "epoch": 2.897506925207756,
1239
+ "grad_norm": 0.5154249668121338,
1240
+ "learning_rate": 2.3486021034170857e-08,
1241
+ "loss": 0.2584,
1242
+ "step": 176
1243
+ },
1244
+ {
1245
+ "epoch": 2.914127423822715,
1246
+ "grad_norm": 0.46952885389328003,
1247
+ "learning_rate": 1.5035294161039882e-08,
1248
+ "loss": 0.2785,
1249
+ "step": 177
1250
+ },
1251
+ {
1252
+ "epoch": 2.930747922437673,
1253
+ "grad_norm": 0.49860695004463196,
1254
+ "learning_rate": 8.459208643659122e-09,
1255
+ "loss": 0.2572,
1256
+ "step": 178
1257
+ },
1258
+ {
1259
+ "epoch": 2.9473684210526314,
1260
+ "grad_norm": 0.5341483354568481,
1261
+ "learning_rate": 3.760237478849793e-09,
1262
+ "loss": 0.2964,
1263
+ "step": 179
1264
+ },
1265
+ {
1266
+ "epoch": 2.96398891966759,
1267
+ "grad_norm": 0.5575993061065674,
1268
+ "learning_rate": 9.401477574932927e-10,
1269
+ "loss": 0.2896,
1270
+ "step": 180
1271
+ },
1272
+ {
1273
+ "epoch": 2.96398891966759,
1274
+ "step": 180,
1275
+ "total_flos": 6.743893969836442e+16,
1276
+ "train_loss": 0.3866574793226189,
1277
+ "train_runtime": 24143.75,
1278
+ "train_samples_per_second": 0.179,
1279
+ "train_steps_per_second": 0.007
1280
+ }
1281
+ ],
1282
+ "logging_steps": 1,
1283
+ "max_steps": 180,
1284
+ "num_input_tokens_seen": 0,
1285
+ "num_train_epochs": 3,
1286
+ "save_steps": 100,
1287
+ "stateful_callbacks": {
1288
+ "TrainerControl": {
1289
+ "args": {
1290
+ "should_epoch_stop": false,
1291
+ "should_evaluate": false,
1292
+ "should_log": false,
1293
+ "should_save": true,
1294
+ "should_training_stop": true
1295
+ },
1296
+ "attributes": {}
1297
+ }
1298
+ },
1299
+ "total_flos": 6.743893969836442e+16,
1300
+ "train_batch_size": 1,
1301
+ "trial_name": null,
1302
+ "trial_params": null
1303
+ }
training_loss.png ADDED
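
The per-step `loss` entries added to `log_history` above are what a plot like `training_loss.png` is typically drawn from. As a minimal sketch (not part of this commit), one way to pull the loss curve out of a `trainer_state.json` of this shape; the embedded `state` dict is a small hand-written sample standing in for the full file:

```python
import json

# Sample standing in for json.load(open("trainer_state.json")):
# per-step logs carry "loss"; the final summary entry carries "train_loss" instead.
state = {
    "log_history": [
        {"epoch": 2.9473684210526314, "loss": 0.2964, "step": 179},
        {"epoch": 2.96398891966759, "loss": 0.2896, "step": 180},
        {"epoch": 2.96398891966759, "step": 180, "train_loss": 0.3866574793226189},
    ],
    "max_steps": 180,
}

# Keep only entries that logged a per-step loss, skipping the summary record.
steps = [entry["step"] for entry in state["log_history"] if "loss" in entry]
losses = [entry["loss"] for entry in state["log_history"] if "loss" in entry]

print(steps)   # step numbers for the x-axis
print(losses)  # loss values for the y-axis
```

Feeding `steps` and `losses` to any plotting library would reproduce a training-loss curve like the attached image.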