Upload folder using huggingface_hub
Browse files- .gitattributes +9 -0
- distributed/.metadata +3 -0
- distributed/__0_0.distcp +3 -0
- distributed/__1_0.distcp +3 -0
- distributed/__2_0.distcp +3 -0
- distributed/__3_0.distcp +3 -0
- distributed/__4_0.distcp +3 -0
- distributed/__5_0.distcp +3 -0
- distributed/__6_0.distcp +3 -0
- distributed/__7_0.distcp +3 -0
- distributed/params.json +1 -0
- distributed/train_state_00000.json +1 -0
- distributed/train_state_00001.json +1 -0
- distributed/train_state_00002.json +1 -0
- distributed/train_state_00003.json +1 -0
- distributed/train_state_00004.json +1 -0
- distributed/train_state_00005.json +1 -0
- distributed/train_state_00006.json +1 -0
- distributed/train_state_00007.json +1 -0
.gitattributes
CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
distributed/.metadata filter=lfs diff=lfs merge=lfs -text
|
37 |
+
distributed/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
|
38 |
+
distributed/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
|
39 |
+
distributed/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
|
40 |
+
distributed/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
|
41 |
+
distributed/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
|
42 |
+
distributed/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
|
43 |
+
distributed/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
|
44 |
+
distributed/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
|
distributed/.metadata
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1175bd731a38b392be86094c2afdbf6cbe445f3e2f5b93c3610a5239ea8359c6
|
3 |
+
size 1148768
|
distributed/__0_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9d2d67655cf6ebf60c66b5cd2c3779d294c477f2d9047694d0b06e3496be63e4
|
3 |
+
size 7858746576
|
distributed/__1_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:83664ca307790cdaa18243827bcc6efa4a7217cbe3074a100c4e7290c76f88e5
|
3 |
+
size 7858785516
|
distributed/__2_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a500423f6248841f557a3ad4df66523199f3373705e104590240173b62dde21
|
3 |
+
size 7858785516
|
distributed/__3_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6fbc7a3681191e5dc37f11e771c76075a69b8fd8426f822d3898c68fced49363
|
3 |
+
size 7858785516
|
distributed/__4_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ec1de706be9a639657f0b58eba56e16d464368c24e1c8f79d2948128a4d5935
|
3 |
+
size 7858785516
|
distributed/__5_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c69b5ed451fe2f6bd97ce997fc6698f130d494163f975412f6dafe79420ec91
|
3 |
+
size 7858787792
|
distributed/__6_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13f0bc192f5ed723c7ebe32222ae1208005076d408ea9cbbe69650e576ca84c8
|
3 |
+
size 7858787792
|
distributed/__7_0.distcp
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80aa3ab4f5e24ae317ed2176e4a1c2d97a293deee8a2108c08fca04259224864
|
3 |
+
size 7858550288
|
distributed/params.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"name": "supertokendump", "dump_dir": "/fsx/craffel/lingua_logs/llama1b_supertokendump/", "seed": 777, "grad_acc_steps": 1, "gc_collect_freq": 1000, "probe_freq": null, "steps": 60000, "data": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "bytes", "path": null, "n_words": 965331}}, "optim": {"lr": 0.003, "weight_decay": 0.033, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 5000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 965331, "weight_tying": false, "sliding_window": null}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 2500, "keep": -1}, "eval": {"every": 5000, "keep": -1}, "path": "/fsx/craffel/lingua_logs/llama1b_supertokendump/checkpoints", "init_ckpt_path": null, "load_init_optimizer_stages": false, "save_init_ckpt": true}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 8, "eval": {"harness": {"tasks": ["hellaswag", {"task": "boolq", "dataset_kwargs": {"trust_remote_code": true}}, "piqa", {"task": "social_iqa", "dataset_kwargs": {"trust_remote_code": true}}, "winogrande", "openbookqa", "arc_easy", "arc_challenge", "race", "commonsense_qa", {"task": "copa", "dataset_kwargs": {"trust_remote_code": true}}, "mmlu", "mmlu_pro"]}, "generator": {"max_tokens": 8192, "dtype": "bf16"}}}
|
distributed/train_state_00000.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.36.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 32384675865282854236158560040517811020, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 208364704594901935986203130224135190747, "inc": 257317082376085721142933171929815648017}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00001.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.00.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 338428155998303934471243505700584626434, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 165228453752528696984315791531667374472, "inc": 173555323965545256606922338259303677603}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00002.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.35.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 327325618392808797324907796132741658836, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 218736872248900330673108710359553704344, "inc": 319170006889470250209362588441616495209}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00003.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.01.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 150756230386790219992322779186012672342, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 234772049952971506209572702348597106054, "inc": 115810872492597857501795428972873905393}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00004.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.37.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 52010840950823229593986838343430613540, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 169112684582997526137434533734107705285, "inc": 303111205818808944921858206842105131807}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00005.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.02.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 102699060900499979923375209019788435132, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 260572155834394800131214637673749996378, "inc": 47382953940698287647753879262736142901}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00006.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.38.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 232471632667331820145564819885404856715, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 169925910427912386056949567841398549015, "inc": 72545526324180839152750112646078969085}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|
distributed/train_state_00007.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 0, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 0, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/", "sources": {"fineweb_edu_10bt_shuffled": 1.0}, "source_to_state": {"fineweb_edu_10bt_shuffled": {"file_path": "/scratch/craffel/lingua/data/fineweb_edu_10bt_shuffled/fineweb_edu_10bt.chunk.03.jsonl", "position": 0, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 118735814132260855315664675830198688556, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "bytes", "path": null}, "output_seq_len": 4096, "n_views": 2, "seq_len": 0}, "seq_idx": 0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 179806467733754435976001987079330798053, "inc": 19761753544780285878460645500694854795}, "has_uint32": 0, "uinteger": 0}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.003], "last_epoch": 0, "verbose": false, "_step_count": 1, "_get_lr_called_within_step": false, "_last_lr": [0.0], "lr_lambdas": [{}]}}
|