diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..075622f70ccdaac52936e216eb715f94b5df042f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-70000/trainer_state.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-10000/rng_state_0.pth b/checkpoint-10000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-10000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-10000/rng_state_2.pth b/checkpoint-10000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-10000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-10000/rng_state_4.pth b/checkpoint-10000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-10000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-10000/rng_state_5.pth b/checkpoint-10000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-10000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d8c05137280460c6ba50e01710b38e57f99f02f --- /dev/null +++ b/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd276f8c4fd161a442246f1759e58dc9343cc71679253a7d2eae16a679cd05a1 +size 1064 diff --git a/checkpoint-100000/rng_state_0.pth b/checkpoint-100000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-100000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-100000/rng_state_7.pth b/checkpoint-100000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-100000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-100000/scheduler.pt b/checkpoint-100000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48ff877e701a92018a5f8c85058edee92bf1d9a0 --- /dev/null +++ b/checkpoint-100000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f6dff38bac6ab395fe555b7fd9eefc1cb9bb98c93653b9276dea7e356b81c41 +size 1064 diff --git a/checkpoint-110000/rng_state_0.pth b/checkpoint-110000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-110000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-110000/rng_state_2.pth b/checkpoint-110000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-110000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-110000/scheduler.pt b/checkpoint-110000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..79bc8848540d0ac0db8216dc6b7942042a322e73 --- /dev/null +++ b/checkpoint-110000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef9802592b36440c0ad1ca18b6819d140b9aa9741cd69c4d5c571ec4ca397cb +size 1064 diff --git a/checkpoint-120000/rng_state_0.pth b/checkpoint-120000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-120000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-120000/rng_state_2.pth b/checkpoint-120000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-120000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-120000/rng_state_5.pth b/checkpoint-120000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-120000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-120000/scheduler.pt b/checkpoint-120000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..46e0058a10212500f8a0833a92c9cb24453b2b68 --- /dev/null +++ b/checkpoint-120000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a165a037839c66fd4aab918390999d1fa4e1a9869261402a32ad15bc1aec52 +size 1064 diff --git a/checkpoint-130000/rng_state_0.pth b/checkpoint-130000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-130000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-130000/rng_state_2.pth b/checkpoint-130000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-130000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-130000/rng_state_7.pth b/checkpoint-130000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-130000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-130000/scheduler.pt b/checkpoint-130000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..12ae674a1d5197f618bfb446a86037aa3cd07b99 --- /dev/null +++ b/checkpoint-130000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8ba3a50230690fd24594961f4ebfcfbc9a22ce902fd23353d720c8f11c30ab +size 1064 diff --git a/checkpoint-140000/rng_state_0.pth b/checkpoint-140000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-140000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-140000/rng_state_1.pth b/checkpoint-140000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-140000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-140000/rng_state_2.pth b/checkpoint-140000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-140000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-140000/rng_state_6.pth b/checkpoint-140000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-140000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-140000/scheduler.pt b/checkpoint-140000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..470403ec5333158c6b5572e2588d558defe2e857 --- /dev/null +++ b/checkpoint-140000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc93506a1733793576e69f2256e352b938674aa83c9b33046f7c437716a31631 +size 1064 diff --git a/checkpoint-1488/rng_state_0.pth b/checkpoint-1488/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-1488/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-1488/rng_state_2.pth b/checkpoint-1488/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-1488/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-1488/rng_state_6.pth b/checkpoint-1488/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-1488/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-1488/scheduler.pt b/checkpoint-1488/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..30136f597900cc33d53e0862340f0113ad2e8b89 --- /dev/null +++ b/checkpoint-1488/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc91124f57f5198cb7ab9d1073c6b8983ec8e34f2b3ac98574c9bd44030c976 +size 1064 diff --git a/checkpoint-150000/rng_state_1.pth b/checkpoint-150000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-150000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-150000/rng_state_3.pth b/checkpoint-150000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-150000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-150000/rng_state_5.pth b/checkpoint-150000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-150000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-150000/rng_state_6.pth b/checkpoint-150000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-150000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-150000/scheduler.pt b/checkpoint-150000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..410bb9ae81e41d97dbd5bd0ad27cbebb2819132c --- /dev/null +++ b/checkpoint-150000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e08313d54cd0118acfd96973ae3ce15247749b3d4f787c03954fe9522a9609 +size 1064 diff --git a/checkpoint-160000/rng_state_0.pth b/checkpoint-160000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-160000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-160000/rng_state_1.pth b/checkpoint-160000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-160000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-160000/rng_state_4.pth b/checkpoint-160000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-160000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-160000/rng_state_6.pth b/checkpoint-160000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-160000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-160000/scheduler.pt b/checkpoint-160000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e51bf35e66ea0b2665a4d2f384883d3d45fcfdde --- /dev/null +++ b/checkpoint-160000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21c34a7b7a98127a12cfc16cbfc44858426de5832ba623601b403e9857280794 +size 1064 diff --git a/checkpoint-170000/rng_state_3.pth b/checkpoint-170000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-170000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-170000/rng_state_4.pth b/checkpoint-170000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-170000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-170000/rng_state_5.pth b/checkpoint-170000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-170000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-170000/scheduler.pt b/checkpoint-170000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..75614c897d19e6dd25706089dde0a0eaac7b04d6 --- /dev/null +++ b/checkpoint-170000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:645c1296b9a9482c444f56e43d15bf87a100544f32099853e45d2a131de3f29c +size 1064 diff --git a/checkpoint-180000/rng_state_0.pth b/checkpoint-180000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-180000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-180000/rng_state_1.pth b/checkpoint-180000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-180000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-180000/rng_state_2.pth b/checkpoint-180000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-180000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-180000/rng_state_3.pth b/checkpoint-180000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-180000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-180000/rng_state_4.pth b/checkpoint-180000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-180000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-180000/rng_state_5.pth b/checkpoint-180000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-180000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-180000/rng_state_6.pth b/checkpoint-180000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-180000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-180000/rng_state_7.pth b/checkpoint-180000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-180000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-180000/scheduler.pt b/checkpoint-180000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7315b4a686bab71f8ae35521fbbcc717c5716103 --- /dev/null +++ b/checkpoint-180000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea9263238f4c9a4bf392a1c90c2abe8da76e7f1539863b2777879ab0c91c95b +size 1064 diff --git a/checkpoint-190000/rng_state_0.pth b/checkpoint-190000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-190000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-190000/rng_state_1.pth b/checkpoint-190000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-190000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-190000/rng_state_2.pth b/checkpoint-190000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-190000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-190000/rng_state_3.pth b/checkpoint-190000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-190000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-190000/rng_state_4.pth b/checkpoint-190000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-190000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-190000/rng_state_5.pth b/checkpoint-190000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-190000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-190000/rng_state_6.pth b/checkpoint-190000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-190000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-190000/rng_state_7.pth b/checkpoint-190000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-190000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-190000/scheduler.pt b/checkpoint-190000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d93eb8686e0e67dea68d941adb300fb3ea842c97 --- /dev/null +++ b/checkpoint-190000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c99e6954ec5b5a2c4ac36620c27747696f0fae609e746c0458f0663ea2f317c +size 1064 diff --git a/checkpoint-20000/rng_state_0.pth b/checkpoint-20000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-20000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-20000/rng_state_2.pth b/checkpoint-20000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-20000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-20000/scheduler.pt b/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..15ad81bf16abde0a35b670509fe6b262669368f9 --- /dev/null +++ b/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c197fa4ce6f670ccf6d80748d3ca09f395d34a167dabbd2f1e3fba02cc126bc5 +size 1064 diff --git a/checkpoint-200000/config.json b/checkpoint-200000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-200000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-200000/generation_config.json b/checkpoint-200000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-200000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-200000/model.safetensors.index.json b/checkpoint-200000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-200000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-200000/rng_state_0.pth b/checkpoint-200000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-200000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-200000/rng_state_1.pth b/checkpoint-200000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-200000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-200000/rng_state_2.pth b/checkpoint-200000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-200000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-200000/rng_state_4.pth b/checkpoint-200000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-200000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-200000/rng_state_5.pth b/checkpoint-200000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-200000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-200000/rng_state_6.pth b/checkpoint-200000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-200000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-200000/rng_state_7.pth b/checkpoint-200000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-200000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-200000/scheduler.pt b/checkpoint-200000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb80c9696e220c702f6ac2a89a5e27141c897022 --- /dev/null +++ b/checkpoint-200000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e64155341bdbaaed659d5c32dc2feb985f09d74c15aabd4d9c4b29c49294ba +size 1064 diff --git a/checkpoint-210000/config.json b/checkpoint-210000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-210000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-210000/generation_config.json b/checkpoint-210000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-210000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-210000/model.safetensors.index.json b/checkpoint-210000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-210000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-210000/rng_state_1.pth b/checkpoint-210000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-210000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-210000/rng_state_2.pth b/checkpoint-210000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-210000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-210000/rng_state_3.pth b/checkpoint-210000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-210000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-210000/rng_state_4.pth b/checkpoint-210000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-210000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-210000/rng_state_5.pth b/checkpoint-210000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-210000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-210000/rng_state_6.pth b/checkpoint-210000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-210000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-210000/rng_state_7.pth b/checkpoint-210000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-210000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-210000/scheduler.pt b/checkpoint-210000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca2501657bc9ff09815060cdcdbed1f5e97b1f1d --- /dev/null +++ b/checkpoint-210000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244d3eb66815dd5bdd2f4225a96e6648fad3f436995c4be9ad2a2fa38a552b7c +size 1064 diff --git a/checkpoint-220000/config.json b/checkpoint-220000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-220000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-220000/generation_config.json b/checkpoint-220000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-220000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-220000/model.safetensors.index.json b/checkpoint-220000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-220000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-220000/rng_state_0.pth b/checkpoint-220000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-220000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-220000/rng_state_1.pth b/checkpoint-220000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-220000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-220000/rng_state_2.pth b/checkpoint-220000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-220000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-220000/rng_state_4.pth b/checkpoint-220000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-220000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-220000/rng_state_6.pth b/checkpoint-220000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-220000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-220000/rng_state_7.pth b/checkpoint-220000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-220000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-220000/scheduler.pt b/checkpoint-220000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5828455c0a6560dbcdf20a81c8fe0d8058de8ae1 --- /dev/null +++ b/checkpoint-220000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad4d4aad55b559bf59c9a40d99e0031445d4a00e21d806db7619a9ebdb9d782 +size 1064 diff --git a/checkpoint-230000/config.json b/checkpoint-230000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-230000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-230000/generation_config.json b/checkpoint-230000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-230000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-230000/model.safetensors.index.json b/checkpoint-230000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-230000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-230000/rng_state_0.pth b/checkpoint-230000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-230000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-230000/rng_state_1.pth b/checkpoint-230000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-230000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-230000/rng_state_2.pth b/checkpoint-230000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-230000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-230000/rng_state_3.pth b/checkpoint-230000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-230000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-230000/rng_state_4.pth b/checkpoint-230000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-230000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-230000/rng_state_5.pth b/checkpoint-230000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-230000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-230000/rng_state_6.pth b/checkpoint-230000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-230000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-230000/rng_state_7.pth b/checkpoint-230000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-230000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-230000/scheduler.pt b/checkpoint-230000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb66c178429f922bfe2a5bd3955f8269c5b66517 --- /dev/null +++ b/checkpoint-230000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62492cab81bbc8c93e426c5fd653e81c28910fad67d08937fba3beda8150ece4 +size 1064 diff --git a/checkpoint-240000/config.json b/checkpoint-240000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-240000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-240000/generation_config.json b/checkpoint-240000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-240000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-240000/model.safetensors.index.json b/checkpoint-240000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-240000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-240000/rng_state_0.pth b/checkpoint-240000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-240000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-240000/rng_state_1.pth b/checkpoint-240000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-240000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-240000/rng_state_2.pth b/checkpoint-240000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-240000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-240000/rng_state_3.pth b/checkpoint-240000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-240000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-240000/rng_state_4.pth b/checkpoint-240000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-240000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-240000/rng_state_5.pth b/checkpoint-240000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-240000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-240000/rng_state_6.pth b/checkpoint-240000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-240000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-240000/rng_state_7.pth b/checkpoint-240000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-240000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-240000/scheduler.pt b/checkpoint-240000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d47fe277ba0d9deb3b0bc9c92b8b61137f4ccca --- /dev/null +++ b/checkpoint-240000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b990edf5f4109557f671bb35c00e75231e30500a17598174bd5ecafa4010882 +size 1064 diff --git a/checkpoint-250000/config.json b/checkpoint-250000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-250000/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-250000/generation_config.json b/checkpoint-250000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-250000/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-250000/model.safetensors.index.json b/checkpoint-250000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-250000/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-250000/rng_state_0.pth b/checkpoint-250000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-250000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-250000/rng_state_1.pth b/checkpoint-250000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-250000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-250000/rng_state_2.pth b/checkpoint-250000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-250000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-250000/rng_state_3.pth b/checkpoint-250000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-250000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-250000/rng_state_4.pth b/checkpoint-250000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-250000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-250000/rng_state_5.pth b/checkpoint-250000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-250000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-250000/rng_state_6.pth b/checkpoint-250000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-250000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-250000/rng_state_7.pth b/checkpoint-250000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-250000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-250000/scheduler.pt b/checkpoint-250000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..df23b84c6e38f9bdf3e948cd3260239f58323cd5 --- /dev/null +++ b/checkpoint-250000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5a4dc0b52f7765974e350905f9eecbc8b067345f78042e2c445d94c45dbdd0 +size 1064 diff --git a/checkpoint-254696/config.json b/checkpoint-254696/config.json new file mode 100644 index 0000000000000000000000000000000000000000..722cb2561cfe2a0572ccda4672cda8629d888fc8 --- /dev/null +++ b/checkpoint-254696/config.json @@ -0,0 +1,39 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.2", + "use_cache": true, + "vocab_size": 156939 +} diff --git a/checkpoint-254696/generation_config.json b/checkpoint-254696/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a57f0b23b323fc7fb54fe90db52f4a1880670f63 --- /dev/null +++ b/checkpoint-254696/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.51.2" +} diff --git a/checkpoint-254696/model.safetensors.index.json b/checkpoint-254696/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..607115229c4b7e85e9ebb2aed91096b190e09c39 --- /dev/null +++ b/checkpoint-254696/model.safetensors.index.json @@ -0,0 +1,154 @@ +{ + "metadata": { + "total_size": 6463873024 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-254696/rng_state_0.pth b/checkpoint-254696/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-254696/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-254696/rng_state_1.pth b/checkpoint-254696/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-254696/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-254696/rng_state_2.pth b/checkpoint-254696/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-254696/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-254696/rng_state_3.pth b/checkpoint-254696/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-254696/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-254696/rng_state_4.pth b/checkpoint-254696/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-254696/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-254696/rng_state_5.pth b/checkpoint-254696/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-254696/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-254696/rng_state_6.pth b/checkpoint-254696/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-254696/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-254696/rng_state_7.pth b/checkpoint-254696/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-254696/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-254696/scheduler.pt b/checkpoint-254696/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..93712b8c1999ad04a7dcfe70944abd5eba866bb0 --- /dev/null +++ b/checkpoint-254696/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f84169eb775160af62b9d3a80c38a0c56e21478615a5c77531d2dbf81105c5 +size 1064 diff --git a/checkpoint-30000/rng_state_0.pth b/checkpoint-30000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-30000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-30000/rng_state_2.pth b/checkpoint-30000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-30000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-30000/rng_state_4.pth b/checkpoint-30000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-30000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-30000/rng_state_7.pth b/checkpoint-30000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-30000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-40000/rng_state_0.pth b/checkpoint-40000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-40000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-40000/rng_state_2.pth b/checkpoint-40000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-40000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-40000/rng_state_4.pth b/checkpoint-40000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-40000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-40000/scheduler.pt b/checkpoint-40000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..13bd8f277f846ca5bea117f55db3c02ad215dc46 --- /dev/null +++ b/checkpoint-40000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da9edcfcf30206aea950825cac456c47b84818b824029bea731e9f843cb7796 +size 1064 diff --git a/checkpoint-50000/rng_state_0.pth b/checkpoint-50000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-50000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-50000/rng_state_5.pth b/checkpoint-50000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-50000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-50000/scheduler.pt b/checkpoint-50000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..843c4668c4d73b0a750f046be52f3194b04db1d9 --- /dev/null +++ b/checkpoint-50000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e52a56c4c35f63ebeca7689aa9ee2bd6e469659f5b38802b9089f3fad5a4b341 +size 1064 diff --git a/checkpoint-60000/rng_state_2.pth b/checkpoint-60000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-60000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-60000/rng_state_3.pth b/checkpoint-60000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-60000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-60000/scheduler.pt b/checkpoint-60000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba15750ebfddf40d737aea42e398e578e614362c --- /dev/null +++ b/checkpoint-60000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:707de3f3b207bc838554cf55627e341addbae50cf3222ea5869db7fd6e6b13b8 +size 1064 diff --git a/checkpoint-70000/rng_state_0.pth b/checkpoint-70000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-70000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-70000/rng_state_2.pth b/checkpoint-70000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-70000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-70000/rng_state_7.pth b/checkpoint-70000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-70000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-70000/scheduler.pt b/checkpoint-70000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f00213ecd59fa6e2aa4cab39a27b8687d13c57e --- /dev/null +++ b/checkpoint-70000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e2fb9d18d9c712ed343710a8e86f2de08fcf1020dafb64b446bb05d60f094ab +size 1064 diff --git a/checkpoint-70000/trainer_state.json b/checkpoint-70000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..293bf206a6d0e5e910fa5f214b71517919f84468 --- /dev/null +++ b/checkpoint-70000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9b016afe8cdf95eb5b996ea3b5dd122dca9486a4b965f761b43c8f9cc1e1db +size 12293083 diff --git a/checkpoint-80000/rng_state_0.pth b/checkpoint-80000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-80000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-80000/rng_state_2.pth b/checkpoint-80000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-80000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-80000/rng_state_4.pth b/checkpoint-80000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-80000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-80000/rng_state_7.pth b/checkpoint-80000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-80000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-80000/scheduler.pt b/checkpoint-80000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbbfbf9cc86b60db96efe82bf4f4af34952f56bf --- /dev/null +++ b/checkpoint-80000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e83c71081ba1dede7e900133697c9bdb09a869fed4638c81cd97b2cab630ca2d +size 1064 diff --git a/checkpoint-90000/rng_state_0.pth b/checkpoint-90000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-90000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-90000/rng_state_2.pth b/checkpoint-90000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-90000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-90000/rng_state_4.pth b/checkpoint-90000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-90000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-90000/scheduler.pt b/checkpoint-90000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..99ad3d132bb2db280d3827c5cb885ea49e8a1a81 --- /dev/null +++ b/checkpoint-90000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96df218e1843288accca7125b33ec1730ce1b60891dbf9908d50225d48d52cd +size 1064